diff --git a/nekryptology/pkg/core/curves/bls48581_curve.go b/nekryptology/pkg/core/curves/bls48581_curve.go index 6326408..bec397d 100644 --- a/nekryptology/pkg/core/curves/bls48581_curve.go +++ b/nekryptology/pkg/core/curves/bls48581_curve.go @@ -7,6 +7,7 @@ package curves import ( + "arena" "errors" "fmt" "io" @@ -47,9 +48,9 @@ func (s *ScalarBls48581) Random(reader io.Reader) Scalar { func (s *ScalarBls48581) Hash(bytes []byte) Scalar { DST := []byte("BLS_SIG_BLS48581G1_XMD:SHA-512_SVDW_RO_NUL_") u := bls48581.Hash_to_field(ext.MC_SHA2, bls48581.HASH_TYPE, DST, bytes, 2) - u[0].Add(u[1]) - b := u[0].Redc() - b.Mod(bls48581.NewBIGints(bls48581.CURVE_Order)) + u[0].Add(u[1], nil) + b := u[0].Redc(nil) + b.Mod(bls48581.NewBIGints(bls48581.CURVE_Order, nil), nil) return &ScalarBls48581{ Value: b, point: s.point, @@ -58,14 +59,14 @@ func (s *ScalarBls48581) Hash(bytes []byte) Scalar { func (s *ScalarBls48581) Zero() Scalar { return &ScalarBls48581{ - Value: bls48581.NewBIGint(0), + Value: bls48581.NewBIGint(0, nil), point: s.point, } } func (s *ScalarBls48581) One() Scalar { return &ScalarBls48581{ - Value: bls48581.NewBIGint(1), + Value: bls48581.NewBIGint(1, nil), point: s.point, } } @@ -75,7 +76,7 @@ func (s *ScalarBls48581) IsZero() bool { } func (s *ScalarBls48581) IsOne() bool { - t := bls48581.NewBIGint(1) + t := bls48581.NewBIGint(1, nil) t.Sub(s.Value) return t.IsZero() } @@ -94,15 +95,15 @@ func (s *ScalarBls48581) IsEven() bool { func (s *ScalarBls48581) New(value int) Scalar { if value > 0 { - t := bls48581.NewBIGint(value) - t.Mod(bls48581.NewBIGints(bls48581.CURVE_Order)) + t := bls48581.NewBIGint(value, nil) + t.Mod(bls48581.NewBIGints(bls48581.CURVE_Order, nil), nil) return &ScalarBls48581{ Value: t, point: s.point, } } else { - t := bls48581.NewBIGint(-value) - v := bls48581.NewBIGints(bls48581.CURVE_Order) + t := bls48581.NewBIGint(-value, nil) + v := bls48581.NewBIGints(bls48581.CURVE_Order, nil) v.Sub(t) return &ScalarBls48581{ Value: v, @@ 
-121,8 +122,8 @@ func (s *ScalarBls48581) Cmp(rhs Scalar) int { } func (s *ScalarBls48581) Square() Scalar { - sqr := bls48581.NewBIGcopy(s.Value) - sqr = bls48581.Modsqr(sqr, bls48581.NewBIGints(bls48581.CURVE_Order)) + sqr := bls48581.NewBIGcopy(s.Value, nil) + sqr = bls48581.Modsqr(sqr, bls48581.NewBIGints(bls48581.CURVE_Order, nil), nil) return &ScalarBls48581{ Value: sqr, point: s.point, @@ -130,8 +131,13 @@ func (s *ScalarBls48581) Square() Scalar { } func (s *ScalarBls48581) Double() Scalar { - dbl := bls48581.NewBIGcopy(s.Value) - dbl = bls48581.Modmul(dbl, bls48581.NewBIGint(2), bls48581.NewBIGints(bls48581.CURVE_Order)) + dbl := bls48581.NewBIGcopy(s.Value, nil) + dbl = bls48581.Modmul( + dbl, + bls48581.NewBIGint(2, nil), + bls48581.NewBIGints(bls48581.CURVE_Order, nil), + nil, + ) return &ScalarBls48581{ Value: dbl, point: s.point, @@ -139,8 +145,8 @@ func (s *ScalarBls48581) Double() Scalar { } func (s *ScalarBls48581) Invert() (Scalar, error) { - v := bls48581.NewBIGcopy(s.Value) - v.Invmodp(bls48581.NewBIGints(bls48581.CURVE_Order)) + v := bls48581.NewBIGcopy(s.Value, nil) + v.Invmodp(bls48581.NewBIGints(bls48581.CURVE_Order, nil)) if v == nil { return nil, fmt.Errorf("inverse doesn't exist") } @@ -155,9 +161,9 @@ func (s *ScalarBls48581) Sqrt() (Scalar, error) { } func (s *ScalarBls48581) Cube() Scalar { - value := bls48581.NewBIGcopy(s.Value) - value = bls48581.Modsqr(value, bls48581.NewBIGints(bls48581.CURVE_Order)) - value = bls48581.Modmul(value, s.Value, bls48581.NewBIGints(bls48581.CURVE_Order)) + value := bls48581.NewBIGcopy(s.Value, nil) + value = bls48581.Modsqr(value, bls48581.NewBIGints(bls48581.CURVE_Order, nil), nil) + value = bls48581.Modmul(value, s.Value, bls48581.NewBIGints(bls48581.CURVE_Order, nil), nil) return &ScalarBls48581{ Value: value, point: s.point, @@ -167,8 +173,11 @@ func (s *ScalarBls48581) Cube() Scalar { func (s *ScalarBls48581) Add(rhs Scalar) Scalar { r, ok := rhs.(*ScalarBls48581) if ok { - value := 
bls48581.NewBIGcopy(s.Value) - value = bls48581.ModAdd(value, r.Value, bls48581.NewBIGints(bls48581.CURVE_Order)) + mem := arena.NewArena() + defer mem.Free() + value := bls48581.NewBIGcopy(s.Value, mem) + value = bls48581.ModAdd(value, r.Value, bls48581.NewBIGints(bls48581.CURVE_Order, mem), mem) + value = bls48581.NewBIGcopy(value, nil) return &ScalarBls48581{ Value: value, point: s.point, @@ -181,9 +190,12 @@ func (s *ScalarBls48581) Add(rhs Scalar) Scalar { func (s *ScalarBls48581) Sub(rhs Scalar) Scalar { r, ok := rhs.(*ScalarBls48581) if ok { - value := bls48581.NewBIGcopy(r.Value) - value = bls48581.Modneg(value, bls48581.NewBIGints(bls48581.CURVE_Order)) - value = bls48581.ModAdd(value, s.Value, bls48581.NewBIGints(bls48581.CURVE_Order)) + mem := arena.NewArena() + defer mem.Free() + value := bls48581.NewBIGcopy(r.Value, mem) + value = bls48581.Modneg(value, bls48581.NewBIGints(bls48581.CURVE_Order, mem), mem) + value = bls48581.ModAdd(value, s.Value, bls48581.NewBIGints(bls48581.CURVE_Order, mem), mem) + value = bls48581.NewBIGcopy(value, nil) return &ScalarBls48581{ Value: value, point: s.point, @@ -196,8 +208,11 @@ func (s *ScalarBls48581) Sub(rhs Scalar) Scalar { func (s *ScalarBls48581) Mul(rhs Scalar) Scalar { r, ok := rhs.(*ScalarBls48581) if ok { - value := bls48581.NewBIGcopy(s.Value) - value = bls48581.Modmul(value, r.Value, bls48581.NewBIGints(bls48581.CURVE_Order)) + mem := arena.NewArena() + defer mem.Free() + value := bls48581.NewBIGcopy(s.Value, mem) + value = bls48581.Modmul(value, r.Value, bls48581.NewBIGints(bls48581.CURVE_Order, mem), mem) + value = bls48581.NewBIGcopy(value, nil) return &ScalarBls48581{ Value: value, point: s.point, @@ -214,9 +229,12 @@ func (s *ScalarBls48581) MulAdd(y, z Scalar) Scalar { func (s *ScalarBls48581) Div(rhs Scalar) Scalar { r, ok := rhs.(*ScalarBls48581) if ok { - value := bls48581.NewBIGcopy(r.Value) - value.Invmodp(bls48581.NewBIGints(bls48581.CURVE_Order)) - value = bls48581.Modmul(value, s.Value, 
bls48581.NewBIGints(bls48581.CURVE_Order)) + mem := arena.NewArena() + defer mem.Free() + value := bls48581.NewBIGcopy(r.Value, mem) + value.Invmodp(bls48581.NewBIGints(bls48581.CURVE_Order, mem)) + value = bls48581.Modmul(value, s.Value, bls48581.NewBIGints(bls48581.CURVE_Order, mem), mem) + value = bls48581.NewBIGcopy(value, nil) return &ScalarBls48581{ Value: value, point: s.point, @@ -227,8 +245,11 @@ func (s *ScalarBls48581) Div(rhs Scalar) Scalar { } func (s *ScalarBls48581) Neg() Scalar { - value := bls48581.NewBIGcopy(s.Value) - value = bls48581.Modneg(value, bls48581.NewBIGints(bls48581.CURVE_Order)) + mem := arena.NewArena() + defer mem.Free() + value := bls48581.NewBIGcopy(s.Value, mem) + value = bls48581.Modneg(value, bls48581.NewBIGints(bls48581.CURVE_Order, mem), mem) + value = bls48581.NewBIGcopy(value, nil) return &ScalarBls48581{ Value: value, point: s.point, @@ -244,7 +265,7 @@ func (s *ScalarBls48581) SetBigInt(v *big.Int) (Scalar, error) { copy(t[bls48581.MODBYTES-uint(len(b)):], b) i := bls48581.FromBytes(t) - i.Mod(bls48581.NewBIGints(bls48581.CURVE_Order)) + i.Mod(bls48581.NewBIGints(bls48581.CURVE_Order, nil), nil) return &ScalarBls48581{ Value: i, point: s.point, @@ -298,7 +319,7 @@ func (s *ScalarBls48581) Point() Point { } func (s *ScalarBls48581) Clone() Scalar { - value := bls48581.NewBIGcopy(s.Value) + value := bls48581.NewBIGcopy(s.Value, nil) return &ScalarBls48581{ Value: value, point: s.point, @@ -306,7 +327,7 @@ func (s *ScalarBls48581) Clone() Scalar { } func (s *ScalarBls48581) SetPoint(p Point) PairingScalar { - value := bls48581.NewBIGcopy(s.Value) + value := bls48581.NewBIGcopy(s.Value, nil) return &ScalarBls48581{ Value: value, point: p, @@ -314,7 +335,7 @@ func (s *ScalarBls48581) SetPoint(p Point) PairingScalar { } func (s *ScalarBls48581) Order() *big.Int { - b := bls48581.NewBIGints(bls48581.CURVE_Order) + b := bls48581.NewBIGints(bls48581.CURVE_Order, nil) bytes := make([]byte, bls48581.MODBYTES) b.ToBytes(bytes) return 
new(big.Int).SetBytes(bytes) @@ -369,7 +390,7 @@ func (p *PointBls48581G1) Hash(bytes []byte) Point { func (p *PointBls48581G1) Identity() Point { g1 := bls48581.ECP_generator() - g1 = g1.Mul(bls48581.NewBIGint(0)) + g1 = g1.Mul(bls48581.NewBIGint(0, nil), nil, nil) return &PointBls48581G1{ Value: g1, } @@ -384,7 +405,7 @@ func (p *PointBls48581G1) Generator() Point { } func (p *PointBls48581G1) IsIdentity() bool { - return p.Value.Is_infinity() + return p.Value.Is_infinity(nil) } func (p *PointBls48581G1) IsNegative() bool { @@ -395,18 +416,18 @@ func (p *PointBls48581G1) IsNegative() bool { } func (p *PointBls48581G1) IsOnCurve() bool { - return bls48581.G1member(p.Value) + return bls48581.G1member(p.Value, nil) } func (p *PointBls48581G1) Double() Point { - v := bls48581.NewECP() + v := bls48581.NewECP(nil) v.Copy(p.Value) - v.Dbl() + v.Dbl(nil) return &PointBls48581G1{v} } func (p *PointBls48581G1) Scalar() Scalar { - value := bls48581.NewBIG() + value := bls48581.NewBIG(nil) return &ScalarBls48581{ Value: value, point: new(PointBls48581G1), @@ -414,9 +435,9 @@ func (p *PointBls48581G1) Scalar() Scalar { } func (p *PointBls48581G1) Neg() Point { - v := bls48581.NewECP() + v := bls48581.NewECP(nil) v.Copy(p.Value) - v.Neg() + v.Neg(nil) return &PointBls48581G1{v} } @@ -426,9 +447,9 @@ func (p *PointBls48581G1) Add(rhs Point) Point { } r, ok := rhs.(*PointBls48581G1) if ok { - v := bls48581.NewECP() + v := bls48581.NewECP(nil) v.Copy(p.Value) - v.Add(r.Value) + v.Add(r.Value, nil) return &PointBls48581G1{v} } else { return nil @@ -441,9 +462,9 @@ func (p *PointBls48581G1) Sub(rhs Point) Point { } r, ok := rhs.(*PointBls48581G1) if ok { - v := bls48581.NewECP() + v := bls48581.NewECP(nil) v.Copy(p.Value) - v.Sub(r.Value) + v.Sub(r.Value, nil) return &PointBls48581G1{v} } else { return nil @@ -456,9 +477,11 @@ func (p *PointBls48581G1) Mul(rhs Scalar) Point { } r, ok := rhs.(*ScalarBls48581) if ok { - v := bls48581.NewECP() + mem := arena.NewArena() + defer 
mem.Free() + v := bls48581.NewECP(mem) v.Copy(p.Value) - v = v.Mul(r.Value) + v = v.Mul(r.Value, nil, mem) return &PointBls48581G1{v} } else { return nil @@ -481,7 +504,7 @@ func (p *PointBls48581G1) Set(x, y *big.Int) (Point, error) { y.FillBytes(yBytes) xBig := bls48581.FromBytes(xBytes) yBig := bls48581.FromBytes(yBytes) - v := bls48581.NewECPbigs(xBig, yBig) + v := bls48581.NewECPbigs(xBig, yBig, nil) if v == nil { return nil, fmt.Errorf("invalid coordinates") } @@ -504,7 +527,7 @@ func (p *PointBls48581G1) FromAffineCompressed(bytes []byte) (Point, error) { var b [bls48581.MODBYTES + 1]byte copy(b[:], bytes) value := bls48581.ECP_fromBytes(b[:]) - if value == nil || value.Is_infinity() { + if value == nil || value.Is_infinity(nil) { return nil, errors.New("could not decode") } return &PointBls48581G1{value}, nil @@ -514,7 +537,7 @@ func (p *PointBls48581G1) FromAffineUncompressed(bytes []byte) (Point, error) { var b [bls48581.MODBYTES*2 + 1]byte copy(b[:], bytes) value := bls48581.ECP_fromBytes(b[:]) - if value == nil || value.Is_infinity() { + if value == nil || value.Is_infinity(nil) { return nil, errors.New("could not decode") } return &PointBls48581G1{value}, nil @@ -541,8 +564,10 @@ func (p *PointBls48581G1) SumOfProducts(points []Point, scalars []Scalar) Point } nScalars[i] = s.Value } - value := bls48581.ECP_muln(len(points), nPoints, nScalars) - if value == nil || value.Is_infinity() { + mem := arena.NewArena() + defer mem.Free() + value := bls48581.ECP_muln(len(points), nPoints, nScalars, mem) + if value == nil || value.Is_infinity(mem) { return nil } return &PointBls48581G1{value} @@ -563,77 +588,60 @@ func (p *PointBls48581G1) Pairing(rhs PairingPoint) Scalar { return &ScalarBls48581Gt{pair} } +func (p *PointBls48581G1) Ate2Pairing( + rhs *PointBls48581G2, + lhs2 *PointBls48581G1, + rhs2 *PointBls48581G2, +) Scalar { + ate2 := bls48581.Ate2(rhs.Value, p.Value, rhs2.Value, lhs2.Value) + + return &ScalarBls48581Gt{ate2} +} + func (p *PointBls48581G1) 
MultiPairing(points ...PairingPoint) Scalar { return bls48multiPairing(points...) } func (p *PointBls48581G1) X() *big.Int { bytes := make([]byte, bls48581.MODBYTES) - p.Value.GetX().ToBytes(bytes[:]) + p.Value.GetX(nil).ToBytes(bytes[:]) return new(big.Int).SetBytes(bytes) } func (p *PointBls48581G1) Y() *big.Int { bytes := make([]byte, bls48581.MODBYTES) - p.Value.GetY().ToBytes(bytes[:]) + p.Value.GetY(nil).ToBytes(bytes[:]) return new(big.Int).SetBytes(bytes) } func (p *PointBls48581G1) Modulus() *big.Int { - b := bls48581.NewBIGints(bls48581.Modulus) + b := bls48581.NewBIGints(bls48581.Modulus, nil) bytes := make([]byte, bls48581.MODBYTES) b.ToBytes(bytes) return new(big.Int).SetBytes(bytes) } func (p *PointBls48581G1) MarshalBinary() ([]byte, error) { - return pointMarshalBinary(p) + return nil, nil } func (p *PointBls48581G1) UnmarshalBinary(input []byte) error { - pt, err := pointUnmarshalBinary(input) - if err != nil { - return err - } - ppt, ok := pt.(*PointBls48581G1) - if !ok { - return fmt.Errorf("invalid point") - } - p.Value = ppt.Value return nil } func (p *PointBls48581G1) MarshalText() ([]byte, error) { - return pointMarshalText(p) + return nil, nil } func (p *PointBls48581G1) UnmarshalText(input []byte) error { - pt, err := pointUnmarshalText(input) - if err != nil { - return err - } - ppt, ok := pt.(*PointBls48581G1) - if !ok { - return fmt.Errorf("invalid point") - } - p.Value = ppt.Value return nil } func (p *PointBls48581G1) MarshalJSON() ([]byte, error) { - return pointMarshalJson(p) + return nil, nil } func (p *PointBls48581G1) UnmarshalJSON(input []byte) error { - pt, err := pointUnmarshalJson(input) - if err != nil { - return err - } - P, ok := pt.(*PointBls48581G1) - if !ok { - return fmt.Errorf("invalid type") - } - p.Value = P.Value return nil } @@ -646,15 +654,15 @@ func (p *PointBls48581G2) Random(reader io.Reader) Point { func (p *PointBls48581G2) Hash(bytes []byte) Point { DST := 
[]byte("BLS_SIG_BLS48581G2_XMD:SHA-512_SVDW_RO_NUL_") u := bls48581.Hash_to_field(ext.MC_SHA2, bls48581.HASH_TYPE, DST, bytes, 2) - u[0].Add(u[1]) - fp8 := bls48581.NewFP8fp(u[0]) + u[0].Add(u[1], nil) + fp8 := bls48581.NewFP8fp(u[0], nil) v := bls48581.ECP8_map2point(fp8) return &PointBls48581G2{v} } func (p *PointBls48581G2) Identity() Point { g2 := bls48581.ECP8_generator() - g2 = g2.Mul(bls48581.NewBIGint(0)) + g2 = g2.Mul(bls48581.NewBIGint(0, nil), nil) return &PointBls48581G2{ Value: g2, } @@ -669,7 +677,7 @@ func (p *PointBls48581G2) Generator() Point { } func (p *PointBls48581G2) IsIdentity() bool { - return p.Value.Is_infinity() + return p.Value.Is_infinity(nil) } func (p *PointBls48581G2) IsNegative() bool { @@ -680,18 +688,18 @@ func (p *PointBls48581G2) IsNegative() bool { } func (p *PointBls48581G2) IsOnCurve() bool { - return bls48581.G2member(p.Value) + return bls48581.G2member(p.Value, nil) } func (p *PointBls48581G2) Double() Point { - v := bls48581.NewECP8() + v := bls48581.NewECP8(nil) v.Copy(p.Value) - v.Dbl() + v.Dbl(nil) return &PointBls48581G2{v} } func (p *PointBls48581G2) Scalar() Scalar { - value := bls48581.NewBIG() + value := bls48581.NewBIG(nil) return &ScalarBls48581{ Value: value, point: new(PointBls48581G2), @@ -699,9 +707,9 @@ func (p *PointBls48581G2) Scalar() Scalar { } func (p *PointBls48581G2) Neg() Point { - v := bls48581.NewECP8() + v := bls48581.NewECP8(nil) v.Copy(p.Value) - v.Neg() + v.Neg(nil) return &PointBls48581G2{v} } @@ -711,9 +719,9 @@ func (p *PointBls48581G2) Add(rhs Point) Point { } r, ok := rhs.(*PointBls48581G2) if ok { - v := bls48581.NewECP8() + v := bls48581.NewECP8(nil) v.Copy(p.Value) - v.Add(r.Value) + v.Add(r.Value, nil) return &PointBls48581G2{v} } else { return nil @@ -726,9 +734,9 @@ func (p *PointBls48581G2) Sub(rhs Point) Point { } r, ok := rhs.(*PointBls48581G2) if ok { - v := bls48581.NewECP8() + v := bls48581.NewECP8(nil) v.Copy(p.Value) - v.Sub(r.Value) + v.Sub(r.Value, nil) return 
&PointBls48581G2{v} } else { return nil @@ -741,11 +749,11 @@ func (p *PointBls48581G2) Mul(rhs Scalar) Point { } r, ok := rhs.(*ScalarBls48581) if ok { - v := bls48581.NewECP8() + mem := arena.NewArena() + defer mem.Free() + v := bls48581.NewECP8(nil) v.Copy(p.Value) - bytes := make([]byte, bls48581.MODBYTES) - r.Value.ToBytes(bytes) - v = v.Mul(bls48581.FromBytes(bytes)) + v = v.Mul(r.Value, mem) return &PointBls48581G2{v} } else { return nil @@ -768,8 +776,8 @@ func (p *PointBls48581G2) Set(x, y *big.Int) (Point, error) { y.FillBytes(yBytes) xBig := bls48581.FP8_fromBytes(xBytes) yBig := bls48581.FP8_fromBytes(yBytes) - v := bls48581.NewECP8fp8s(xBig, yBig) - if v == nil || v.Is_infinity() { + v := bls48581.NewECP8fp8s(xBig, yBig, nil) + if v == nil || v.Is_infinity(nil) { return nil, fmt.Errorf("invalid coordinates") } return &PointBls48581G2{v}, nil @@ -791,7 +799,7 @@ func (p *PointBls48581G2) FromAffineCompressed(bytes []byte) (Point, error) { var b [bls48581.MODBYTES*8 + 1]byte copy(b[:], bytes) value := bls48581.ECP8_fromBytes(b[:]) - if value == nil || value.Is_infinity() { + if value == nil || value.Is_infinity(nil) { return nil, errors.New("could not decode") } return &PointBls48581G2{value}, nil @@ -801,7 +809,7 @@ func (p *PointBls48581G2) FromAffineUncompressed(bytes []byte) (Point, error) { var b [bls48581.MODBYTES*16 + 1]byte copy(b[:], bytes) value := bls48581.ECP8_fromBytes(b[:]) - if value == nil || value.Is_infinity() { + if value == nil || value.Is_infinity(nil) { return nil, errors.New("could not decode") } return &PointBls48581G2{value}, nil @@ -828,8 +836,8 @@ func (p *PointBls48581G2) SumOfProducts(points []Point, scalars []Scalar) Point } nScalars[i] = s.Value } - value := bls48581.Mul16(nPoints, nScalars) - if value == nil || value.Is_infinity() { + value := bls48581.Mul16(nPoints, nScalars, nil) + if value == nil || value.Is_infinity(nil) { return nil } return &PointBls48581G2{value} @@ -855,74 +863,47 @@ func (p *PointBls48581G2) 
MultiPairing(points ...PairingPoint) Scalar { } func (p *PointBls48581G2) X() *big.Int { - x := p.Value.GetX() + x := p.Value.GetX(nil) bytes := make([]byte, 8*bls48581.MODBYTES) x.ToBytes(bytes) return new(big.Int).SetBytes(bytes) } func (p *PointBls48581G2) Y() *big.Int { - y := p.Value.GetY() + y := p.Value.GetY(nil) bytes := make([]byte, 8*bls48581.MODBYTES) y.ToBytes(bytes) return new(big.Int).SetBytes(bytes) } func (p *PointBls48581G2) Modulus() *big.Int { - b := bls48581.NewBIGints(bls48581.Modulus) + b := bls48581.NewBIGints(bls48581.Modulus, nil) bytes := make([]byte, bls48581.MODBYTES) b.ToBytes(bytes) return new(big.Int).SetBytes(bytes) } func (p *PointBls48581G2) MarshalBinary() ([]byte, error) { - return pointMarshalBinary(p) + return nil, nil } func (p *PointBls48581G2) UnmarshalBinary(input []byte) error { - pt, err := pointUnmarshalBinary(input) - if err != nil { - return err - } - ppt, ok := pt.(*PointBls48581G2) - if !ok { - return fmt.Errorf("invalid point") - } - p.Value = ppt.Value return nil } func (p *PointBls48581G2) MarshalText() ([]byte, error) { - return pointMarshalText(p) + return nil, nil } func (p *PointBls48581G2) UnmarshalText(input []byte) error { - pt, err := pointUnmarshalText(input) - if err != nil { - return err - } - ppt, ok := pt.(*PointBls48581G2) - if !ok { - return fmt.Errorf("invalid point") - } - p.Value = ppt.Value return nil } func (p *PointBls48581G2) MarshalJSON() ([]byte, error) { - return pointMarshalJson(p) + return nil, nil } func (p *PointBls48581G2) UnmarshalJSON(input []byte) error { - pt, err := pointUnmarshalJson(input) - if err != nil { - return err - } - P, ok := pt.(*PointBls48581G2) - if !ok { - return fmt.Errorf("invalid type") - } - p.Value = P.Value return nil } @@ -931,21 +912,25 @@ func bls48multiPairing(points ...PairingPoint) Scalar { return nil } valid := true - r := bls48581.Initmp() + mem := arena.NewArena() + defer mem.Free() + r := bls48581.Initmp(mem) for i := 0; i < len(points); i += 2 { 
pt1, ok := points[i].(*PointBls48581G1) valid = valid && ok pt2, ok := points[i+1].(*PointBls48581G2) valid = valid && ok if valid { - bls48581.Another(r, pt2.Value, pt1.Value) + inner := arena.NewArena() + bls48581.Another(r, pt2.Value, pt1.Value, inner) + inner.Free() } } if !valid { return nil } - v := bls48581.Miller(r) + v := bls48581.Miller(r, mem) v = bls48581.Fexp(v) return &ScalarBls48581Gt{v} } @@ -973,15 +958,15 @@ func (s *ScalarBls48581Gt) Hash(bytes []byte) Scalar { } func (s *ScalarBls48581Gt) Zero() Scalar { - return &ScalarBls48581Gt{bls48581.NewFP48int(0)} + return &ScalarBls48581Gt{bls48581.NewFP48int(0, nil)} } func (s *ScalarBls48581Gt) One() Scalar { - return &ScalarBls48581Gt{bls48581.NewFP48int(1)} + return &ScalarBls48581Gt{bls48581.NewFP48int(1, nil)} } func (s *ScalarBls48581Gt) IsZero() bool { - return s.Value.IsZero() + return s.Value.IsZero(nil) } func (s *ScalarBls48581Gt) IsOne() bool { @@ -1034,7 +1019,7 @@ func (s *ScalarBls48581Gt) IsEven() bool { } func (s *ScalarBls48581Gt) New(input int) Scalar { - fp := bls48581.NewFP48int(input) + fp := bls48581.NewFP48int(input, nil) return &ScalarBls48581Gt{fp} } @@ -1048,20 +1033,20 @@ func (s *ScalarBls48581Gt) Cmp(rhs Scalar) int { } func (s *ScalarBls48581Gt) Square() Scalar { - v := bls48581.NewFP48copy(s.Value) - v.Sqr() + v := bls48581.NewFP48copy(s.Value, nil) + v.Sqr(nil) return &ScalarBls48581Gt{v} } func (s *ScalarBls48581Gt) Double() Scalar { - v := bls48581.NewFP48copy(s.Value) - v.Mul(bls48581.NewFP48int(2)) + v := bls48581.NewFP48copy(s.Value, nil) + v.Mul(bls48581.NewFP48int(2, nil), nil) return &ScalarBls48581Gt{v} } func (s *ScalarBls48581Gt) Invert() (Scalar, error) { - v := bls48581.NewFP48copy(s.Value) - v.Invert() + v := bls48581.NewFP48copy(s.Value, nil) + v.Invert(nil) if v == nil { return nil, fmt.Errorf("not invertible") } @@ -1074,9 +1059,9 @@ func (s *ScalarBls48581Gt) Sqrt() (Scalar, error) { } func (s *ScalarBls48581Gt) Cube() Scalar { - v := 
bls48581.NewFP48copy(s.Value) - v.Sqr() - v.Mul(s.Value) + v := bls48581.NewFP48copy(s.Value, nil) + v.Sqr(nil) + v.Mul(s.Value, nil) return &ScalarBls48581Gt{v} } @@ -1093,8 +1078,8 @@ func (s *ScalarBls48581Gt) Sub(rhs Scalar) Scalar { func (s *ScalarBls48581Gt) Mul(rhs Scalar) Scalar { r, ok := rhs.(*ScalarBls48581Gt) if ok { - v := bls48581.NewFP48copy(s.Value) - v.Mul(r.Value) + v := bls48581.NewFP48copy(s.Value, nil) + v.Mul(r.Value, nil) return &ScalarBls48581Gt{v} } else { return nil @@ -1108,9 +1093,9 @@ func (s *ScalarBls48581Gt) MulAdd(y, z Scalar) Scalar { func (s *ScalarBls48581Gt) Div(rhs Scalar) Scalar { r, ok := rhs.(*ScalarBls48581Gt) if ok { - v := bls48581.NewFP48copy(r.Value) - v.Invert() - v.Mul(s.Value) + v := bls48581.NewFP48copy(r.Value, nil) + v.Invert(nil) + v.Mul(s.Value, nil) return &ScalarBls48581Gt{v} } else { return nil @@ -1160,7 +1145,7 @@ func (s *ScalarBls48581Gt) SetBytesWide(bytes []byte) (Scalar, error) { } func (s *ScalarBls48581Gt) Clone() Scalar { - fp := bls48581.NewFP48copy(s.Value) + fp := bls48581.NewFP48copy(s.Value, nil) return &ScalarBls48581Gt{ Value: fp, } diff --git a/nekryptology/pkg/core/curves/bls48581_curve_test.go b/nekryptology/pkg/core/curves/bls48581_curve_test.go index d7540c5..72e24f3 100644 --- a/nekryptology/pkg/core/curves/bls48581_curve_test.go +++ b/nekryptology/pkg/core/curves/bls48581_curve_test.go @@ -78,9 +78,9 @@ func TestScalarBls48581G1Invert(t *testing.T) { nine := bls48581G1.Scalar.New(9) actual, _ := nine.Invert() sa, _ := actual.(*ScalarBls48581) - expected, err := bls48581G1.Scalar.SetBigInt(bhex("ab22a52d6e7108e9eabb0e17e8139cf4b9392413a05486ec3dcef3b90bea3db988c1478b9ec2b4f1382ab890f18c0c9a0f85d504cc493f9b79f8c84e41d01ae5070000000000000000")) + expected, err := bls48581G1.Scalar.SetBigInt(bhex("000000000000000007e51ad0414ec8f8799b3f49cc04d5850f9a0c8cf190b82a38f1b4c29e8b47c188b93dea0bb9f3ce3dec8654a0132439b9f49c13e8170ebbeae908716e2da522ab")) require.NoError(t, err) - require.Equal(t, 
sa.Cmp(expected), 0) + require.Equal(t, sa.Value.ToString(), expected.(*ScalarBls48581).Value.ToString()) } func TestScalarBls48581G1Add(t *testing.T) { @@ -91,11 +91,11 @@ func TestScalarBls48581G1Add(t *testing.T) { require.NotNil(t, fifteen) expected := bls48581G1.Scalar.New(15) require.Equal(t, expected.Cmp(fifteen), 0) - qq := bls48581.NewBIGints(bls48581.CURVE_Order) - qq.Sub(bls48581.NewBIGint(3)) + qq := bls48581.NewBIGints(bls48581.CURVE_Order, nil) + qq.Sub(bls48581.NewBIGint(3, nil)) upper := &ScalarBls48581{ - Value: bls48581.NewBIGcopy(qq), + Value: bls48581.NewBIGcopy(qq, nil), } actual := upper.Add(nine) require.NotNil(t, actual) @@ -106,8 +106,8 @@ func TestScalarBls48581G1Sub(t *testing.T) { bls48581G1 := BLS48581G1() nine := bls48581G1.Scalar.New(9) six := bls48581G1.Scalar.New(6) - n := bls48581.NewFPbig(bls48581.NewBIGints(bls48581.CURVE_Order)) - n.Sub(bls48581.NewFPint(3)) + n := bls48581.NewFPbig(bls48581.NewBIGints(bls48581.CURVE_Order, nil), nil) + n.Sub(bls48581.NewFPint(3, nil), nil) expected := bls48581G1.Scalar.New(0).Sub(bls48581G1.Scalar.New(3)) actual := six.Sub(nine) @@ -138,7 +138,7 @@ func TestScalarBls48581G1Serialize(t *testing.T) { sc := bls48581G1.Scalar.New(255) sequence := sc.Bytes() require.Equal(t, len(sequence), 73) - require.Equal(t, sequence, []byte{0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}) + require.Equal(t, sequence, []byte{0x00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff}) ret, err := bls48581G1.Scalar.SetBytes(sequence) require.NoError(t, err) require.Equal(t, ret.Cmp(sc), 0) diff --git a/nekryptology/pkg/core/curves/curve.go b/nekryptology/pkg/core/curves/curve.go index 420c4a4..424203b 100644 --- a/nekryptology/pkg/core/curves/curve.go +++ b/nekryptology/pkg/core/curves/curve.go @@ -575,11 +575,11 @@ func BLS48581G1() *Curve { func bls48581g1Init() { bls48581g1 = Curve{ Scalar: &ScalarBls48581{ - Value: bls48581.NewBIGint(1), + Value: bls48581.NewBIGint(1, nil), point: new(PointBls48581G1), }, Point: new(PointBls48581G1).Identity(), - Name: BLS12381G1Name, + Name: BLS48581G1Name, } } @@ -592,7 +592,7 @@ func BLS48581G2() *Curve { func bls48581g2Init() { bls48581g2 = Curve{ Scalar: &ScalarBls48581{ - Value: bls48581.NewBIGint(1), + Value: bls48581.NewBIGint(1, nil), point: new(PointBls48581G2), }, Point: new(PointBls48581G2).Identity(), @@ -603,7 +603,7 @@ func bls48581g2Init() { func BLS48581(preferredPoint Point) *PairingCurve { return &PairingCurve{ Scalar: &ScalarBls48581{ - Value: bls48581.NewBIG(), + Value: bls48581.NewBIG(nil), point: preferredPoint, }, PointG1: &PointBls48581G1{ @@ -613,7 +613,7 @@ func BLS48581(preferredPoint Point) *PairingCurve { Value: bls48581.ECP8_generator(), }, GT: &ScalarBls48581Gt{ - Value: bls48581.NewFP48int(1), + Value: bls48581.NewFP48int(1, nil), }, Name: BLS48581Name, } @@ -863,38 +863,40 @@ type sswuParams struct { // Let `n` be a number of point-scalar pairs. // Let `w` be a window of bits (6..8, chosen based on `n`, see cost factor). // -// 1. Prepare `2^(w-1) - 1` buckets with indices `[1..2^(w-1))` initialized with identity points. -// Bucket 0 is not needed as it would contain points multiplied by 0. -// 2. Convert scalars to a radix-`2^w` representation with signed digits in `[-2^w/2, 2^w/2]`. -// Note: only the last digit may equal `2^w/2`. -// 3. 
Starting with the last window, for each point `i=[0..n)` add it to a a bucket indexed by -// the point's scalar's value in the window. -// 4. Once all points in a window are sorted into buckets, add buckets by multiplying each -// by their index. Efficient way of doing it is to start with the last bucket and compute two sums: -// intermediate sum from the last to the first, and the full sum made of all intermediate sums. -// 5. Shift the resulting sum of buckets by `w` bits by using `w` doublings. -// 6. Add to the return value. -// 7. Repeat the loop. +// 1. Prepare `2^(w-1) - 1` buckets with indices `[1..2^(w-1))` initialized with identity points. +// Bucket 0 is not needed as it would contain points multiplied by 0. +// 2. Convert scalars to a radix-`2^w` representation with signed digits in `[-2^w/2, 2^w/2]`. +// Note: only the last digit may equal `2^w/2`. +// 3. Starting with the last window, for each point `i=[0..n)` add it to a a bucket indexed by +// the point's scalar's value in the window. +// 4. Once all points in a window are sorted into buckets, add buckets by multiplying each +// by their index. Efficient way of doing it is to start with the last bucket and compute two sums: +// intermediate sum from the last to the first, and the full sum made of all intermediate sums. +// 5. Shift the resulting sum of buckets by `w` bits by using `w` doublings. +// 6. Add to the return value. +// 7. Repeat the loop. 
// // Approximate cost w/o wNAF optimizations (A = addition, D = doubling): // // ```ascii // cost = (n*A + 2*(2^w/2)*A + w*D + A)*256/w -// | | | | | -// | | | | looping over 256/w windows -// | | | adding to the result -// sorting points | shifting the sum by w bits (to the next window, starting from last window) -// one by one | -// into buckets adding/subtracting all buckets -// multiplied by their indexes -// using a sum of intermediate sums +// +// | | | | | +// | | | | looping over 256/w windows +// | | | adding to the result +// sorting points | shifting the sum by w bits (to the next window, starting from last window) +// one by one | +// into buckets adding/subtracting all buckets +// multiplied by their indexes +// using a sum of intermediate sums +// // ``` // // For large `n`, dominant factor is (n*256/w) additions. // However, if `w` is too big and `n` is not too big, then `(2^w/2)*A` could dominate. // Therefore, the optimal choice of `w` grows slowly as `n` grows. // -// For constant time we use a fixed window of 6 +// # For constant time we use a fixed window of 6 // // This algorithm is adapted from section 4 of . // and https://cacr.uwaterloo.ca/techreports/2010/cacr2010-26.pdf diff --git a/nekryptology/pkg/core/curves/native/bls48581/arch.go b/nekryptology/pkg/core/curves/native/bls48581/arch_32.go similarity index 97% rename from nekryptology/pkg/core/curves/native/bls48581/arch.go rename to nekryptology/pkg/core/curves/native/bls48581/arch_32.go index ac0c2e7..436f2a7 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/arch.go +++ b/nekryptology/pkg/core/curves/native/bls48581/arch_32.go @@ -1,3 +1,5 @@ +//go:build js && wasm + /* * Copyright (c) 2012-2020 MIRACL UK Ltd. 
* diff --git a/nekryptology/pkg/core/curves/native/bls48581/arch_64.go b/nekryptology/pkg/core/curves/native/bls48581/arch_64.go new file mode 100644 index 0000000..2736e68 --- /dev/null +++ b/nekryptology/pkg/core/curves/native/bls48581/arch_64.go @@ -0,0 +1,28 @@ +//go:build !js && !wasm + +/* + * Copyright (c) 2012-2020 MIRACL UK Ltd. + * + * This file is part of MIRACL Core + * (see https://github.com/miracl/core). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* core BIG number class */ + +package bls48581 + +type Chunk int64 + +const CHUNK int = 64 /* Set word size */ diff --git a/nekryptology/pkg/core/curves/native/bls48581/big.go b/nekryptology/pkg/core/curves/native/bls48581/big_32.go similarity index 99% rename from nekryptology/pkg/core/curves/native/bls48581/big.go rename to nekryptology/pkg/core/curves/native/bls48581/big_32.go index 2540cb9..c556db2 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/big.go +++ b/nekryptology/pkg/core/curves/native/bls48581/big_32.go @@ -1,3 +1,5 @@ +//go:build js && wasm + /* * Copyright (c) 2012-2020 MIRACL UK Ltd. * diff --git a/nekryptology/pkg/core/curves/native/bls48581/big_64.go b/nekryptology/pkg/core/curves/native/bls48581/big_64.go new file mode 100644 index 0000000..31d016c --- /dev/null +++ b/nekryptology/pkg/core/curves/native/bls48581/big_64.go @@ -0,0 +1,999 @@ +//go:build !js && !wasm + +/* + * Copyright (c) 2012-2020 MIRACL UK Ltd. 
+ * + * This file is part of MIRACL Core + * (see https://github.com/miracl/core). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* core BIG number class */ + +package bls48581 + +import ( + "arena" + "math/bits" + "strconv" + + "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" +) + +//import "fmt" + +type BIG struct { + w [NLEN]Chunk +} + +type DBIG struct { + w [2 * NLEN]Chunk +} + +/***************** 64-bit specific code ****************/ + +/* First the 32/64-bit dependent BIG code */ +/* Note that because of the lack of a 128-bit integer, 32 and 64-bit code needs to be done differently */ + +/* return a*b as DBIG */ +func mul(a *BIG, b *BIG, mem *arena.Arena) *DBIG { + c := NewDBIG(mem) + carry := Chunk(0) + + for i := 0; i < NLEN; i++ { + carry = 0 + for j := 0; j < NLEN; j++ { + carry, c.w[i+j] = mulAdd(a.w[i], b.w[j], carry, c.w[i+j]) + } + c.w[NLEN+i] = carry + } + + return c +} + +/* return a^2 as DBIG */ +func sqr(a *BIG, mem *arena.Arena) *DBIG { + c := NewDBIG(mem) + carry := Chunk(0) + + for i := 0; i < NLEN; i++ { + carry = 0 + for j := i + 1; j < NLEN; j++ { + //if a.w[i]<0 {fmt.Printf("Negative m i in sqr\n")} + //if a.w[j]<0 {fmt.Printf("Negative m j in sqr\n")} + carry, c.w[i+j] = mulAdd(2*a.w[i], a.w[j], carry, c.w[i+j]) + } + c.w[NLEN+i] = carry + } + + for i := 0; i < NLEN; i++ { + //if a.w[i]<0 {fmt.Printf("Negative m s in sqr\n")} + top, bot := mulAdd(a.w[i], a.w[i], 0, c.w[2*i]) + 
+ c.w[2*i] = bot + c.w[2*i+1] += top + } + c.norm() + return c +} + +func monty(md *BIG, mc Chunk, d *DBIG, mem *arena.Arena) *BIG { + carry := Chunk(0) + m := Chunk(0) + for i := 0; i < NLEN; i++ { + if mc == -1 { + m = (-d.w[i]) & BMASK + } else { + if mc == 1 { + m = d.w[i] + } else { + m = (mc * d.w[i]) & BMASK + } + } + + carry = 0 + for j := 0; j < NLEN; j++ { + carry, d.w[i+j] = mulAdd(m, md.w[j], carry, d.w[i+j]) + //if m<0 {fmt.Printf("Negative m in monty\n")} + //if md.w[j]<0 {fmt.Printf("Negative m in monty\n")} + } + d.w[NLEN+i] += carry + } + + b := NewBIG(mem) + for i := 0; i < NLEN; i++ { + b.w[i] = d.w[NLEN+i] + } + b.norm() + return b +} + +/* set this[i]+=x*y+c, and return high part */ +func mulAdd(a Chunk, b Chunk, c Chunk, r Chunk) (Chunk, Chunk) { + + tp, bt := bits.Mul64(uint64(a), uint64(b)) // use math/bits intrinsic + bot := Chunk(bt & uint64(BMASK)) + top := Chunk((tp << (64 - BASEBITS)) | (bt >> BASEBITS)) + bot += c + bot += r + carry := bot >> BASEBITS + bot &= BMASK + top += carry + return top, bot + +} + +/************************************************************/ + +func (r *BIG) get(i int) Chunk { + return r.w[i] +} + +func (r *BIG) set(i int, x Chunk) { + r.w[i] = x +} + +func (r *BIG) xortop(x Chunk) { + r.w[NLEN-1] ^= x +} + +/* normalise BIG - force all digits < 2^BASEBITS */ +func (r *BIG) norm() Chunk { + carry := Chunk(0) + for i := 0; i < NLEN-1; i++ { + d := r.w[i] + carry + r.w[i] = d & BMASK + carry = d >> BASEBITS + } + r.w[NLEN-1] = (r.w[NLEN-1] + carry) + return (r.w[NLEN-1] >> ((8 * MODBYTES) % BASEBITS)) +} + +/* Shift right by less than a word */ +func (r *BIG) fshr(k uint) int { + w := r.w[0] & ((Chunk(1) << k) - 1) /* shifted out part */ + for i := 0; i < NLEN-1; i++ { + r.w[i] = (r.w[i] >> k) | ((r.w[i+1] << (BASEBITS - k)) & BMASK) + } + r.w[NLEN-1] = r.w[NLEN-1] >> k + return int(w) +} + +/* Shift right by less than a word */ +func (r *BIG) fshl(k uint) int { + r.w[NLEN-1] = (r.w[NLEN-1] << k) | (r.w[NLEN-2] 
>> (BASEBITS - k)) + for i := NLEN - 2; i > 0; i-- { + r.w[i] = ((r.w[i] << k) & BMASK) | (r.w[i-1] >> (BASEBITS - k)) + } + r.w[0] = (r.w[0] << k) & BMASK + return int(r.w[NLEN-1] >> ((8 * MODBYTES) % BASEBITS)) /* return excess - only used in ff.c */ +} + +func NewBIG(mem *arena.Arena) *BIG { + var b *BIG + if mem != nil { + b = arena.New[BIG](mem) + } else { + b = new(BIG) + } + for i := 0; i < NLEN; i++ { + b.w[i] = 0 + } + return b +} + +func NewBIGints(x [NLEN]Chunk, mem *arena.Arena) *BIG { + var b *BIG + if mem != nil { + b = arena.New[BIG](mem) + } else { + b = new(BIG) + } + for i := 0; i < NLEN; i++ { + b.w[i] = x[i] + } + return b +} + +func NewBIGint(x int, mem *arena.Arena) *BIG { + var b *BIG + if mem != nil { + b = arena.New[BIG](mem) + } else { + b = new(BIG) + } + b.w[0] = Chunk(x) + for i := 1; i < NLEN; i++ { + b.w[i] = 0 + } + return b +} + +func NewBIGcopy(x *BIG, mem *arena.Arena) *BIG { + var b *BIG + if mem != nil { + b = arena.New[BIG](mem) + } else { + b = new(BIG) + } + for i := 0; i < NLEN; i++ { + b.w[i] = x.w[i] + } + return b +} + +func NewBIGdcopy(x *DBIG, mem *arena.Arena) *BIG { + var b *BIG + if mem != nil { + b = arena.New[BIG](mem) + } else { + b = new(BIG) + } + for i := 0; i < NLEN; i++ { + b.w[i] = x.w[i] + } + return b +} + +/* test for zero */ +func (r *BIG) IsZero() bool { + d := Chunk(0) + for i := 0; i < NLEN; i++ { + d |= r.w[i] + } + return (1 & ((d - 1) >> BASEBITS)) != 0 +} + +/* set to zero */ +func (r *BIG) zero() { + for i := 0; i < NLEN; i++ { + r.w[i] = 0 + } +} + +/* Test for equal to one */ +func (r *BIG) isunity() bool { + d := Chunk(0) + for i := 1; i < NLEN; i++ { + d |= r.w[i] + } + return (1 & ((d - 1) >> BASEBITS) & (((r.w[0] ^ 1) - 1) >> BASEBITS)) != 0 +} + +/* set to one */ +func (r *BIG) one() { + r.w[0] = 1 + for i := 1; i < NLEN; i++ { + r.w[i] = 0 + } +} + +/* Copy from another BIG */ +func (r *BIG) copy(x *BIG) { + for i := 0; i < NLEN; i++ { + r.w[i] = x.w[i] + } +} + +/* Copy from another DBIG 
*/ +func (r *BIG) dcopy(x *DBIG) { + for i := 0; i < NLEN; i++ { + r.w[i] = x.w[i] + } +} + +/* Conditional swap of two bigs depending on d using XOR - no branches */ +func (r *BIG) cswap(b *BIG, d int) Chunk { + c := Chunk(-d) + s := Chunk(0) + v := r.w[0] ^ b.w[1] + va := v + v + va >>= 1 + for i := 0; i < NLEN; i++ { + t := c & (r.w[i] ^ b.w[i]) + t ^= v + e := r.w[i] ^ t + s ^= e // to force calculation of e + r.w[i] = e ^ va + e = b.w[i] ^ t + s ^= e + b.w[i] = e ^ va + } + return s +} + +func (r *BIG) cmove(g *BIG, d int) Chunk { + b := Chunk(-d) + s := Chunk(0) + v := r.w[0] ^ g.w[1] + va := v + v + va >>= 1 + for i := 0; i < NLEN; i++ { + t := (r.w[i] ^ g.w[i]) & b + t ^= v + e := r.w[i] ^ t + s ^= e + r.w[i] = e ^ va + } + return s +} + +/* general shift right */ +func (r *BIG) shr(k uint) { + n := (k % BASEBITS) + m := int(k / BASEBITS) + for i := 0; i < NLEN-m-1; i++ { + r.w[i] = (r.w[m+i] >> n) | ((r.w[m+i+1] << (BASEBITS - n)) & BMASK) + } + r.w[NLEN-m-1] = r.w[NLEN-1] >> n + for i := NLEN - m; i < NLEN; i++ { + r.w[i] = 0 + } +} + +/* general shift left */ +func (r *BIG) shl(k uint) { + n := k % BASEBITS + m := int(k / BASEBITS) + + r.w[NLEN-1] = (r.w[NLEN-1-m] << n) + if NLEN >= m+2 { + r.w[NLEN-1] |= (r.w[NLEN-m-2] >> (BASEBITS - n)) + } + for i := NLEN - 2; i > m; i-- { + r.w[i] = ((r.w[i-m] << n) & BMASK) | (r.w[i-m-1] >> (BASEBITS - n)) + } + r.w[m] = (r.w[0] << n) & BMASK + for i := 0; i < m; i++ { + r.w[i] = 0 + } +} + +/* return number of bits */ +func (r *BIG) nbits() int { + t := NewBIGcopy(r, nil) + k := NLEN - 1 + t.norm() + for k >= 0 && t.w[k] == 0 { + k-- + } + if k < 0 { + return 0 + } + bts := int(BASEBITS) * k + c := t.w[k] + for c != 0 { + c /= 2 + bts++ + } + return bts +} + +func (r *BIG) Nbits() int { + return r.nbits() +} + +/* Convert to Hex String */ +func (r *BIG) ToString() string { + s := "" + len := r.nbits() + + if len%4 == 0 { + len /= 4 + } else { + len /= 4 + len++ + + } + MB := int(MODBYTES * 2) + if len < MB { + len 
= MB + } + + for i := len - 1; i >= 0; i-- { + b := NewBIGcopy(r, nil) + + b.shr(uint(i * 4)) + s += strconv.FormatInt(int64(b.w[0]&15), 16) + } + return s +} + +func (r *BIG) Add(x *BIG) { + for i := 0; i < NLEN; i++ { + r.w[i] = r.w[i] + x.w[i] + } +} + +func (r *BIG) or(x *BIG) { + for i := 0; i < NLEN; i++ { + r.w[i] = r.w[i] | x.w[i] + } +} + +/* return this+x */ +func (r *BIG) Plus(x *BIG) *BIG { + s := new(BIG) + for i := 0; i < NLEN; i++ { + s.w[i] = r.w[i] + x.w[i] + } + s.norm() + return s +} + +/* this+=x, where x is int */ +func (r *BIG) inc(x int) { + r.norm() + r.w[0] += Chunk(x) +} + +/* this*=c and catch overflow in DBIG */ +func (r *BIG) pxmul(c int, mem *arena.Arena) *DBIG { + m := NewDBIG(mem) + carry := Chunk(0) + for j := 0; j < NLEN; j++ { + carry, m.w[j] = mulAdd(r.w[j], Chunk(c), carry, m.w[j]) + //if c<0 {fmt.Printf("Negative c in pxmul\n")} + //if r.w[j]<0 {fmt.Printf("Negative c in pxmul\n")} + } + m.w[NLEN] = carry + return m +} + +/* return this-x */ +func (r *BIG) Minus(x *BIG) *BIG { + d := new(BIG) + for i := 0; i < NLEN; i++ { + d.w[i] = r.w[i] - x.w[i] + } + return d +} + +/* this-=x */ +func (r *BIG) Sub(x *BIG) { + for i := 0; i < NLEN; i++ { + r.w[i] = r.w[i] - x.w[i] + } +} + +/* reverse subtract this=x-this */ +func (r *BIG) rsub(x *BIG) { + for i := 0; i < NLEN; i++ { + r.w[i] = x.w[i] - r.w[i] + } +} + +/* this-=x, where x is int */ +func (r *BIG) dec(x int) { + r.norm() + r.w[0] -= Chunk(x) +} + +/* this*=x, where x is small intNEXCESS */ +func (r *BIG) pmul(c int) Chunk { + carry := Chunk(0) + // r.norm(); + for i := 0; i < NLEN; i++ { + ak := r.w[i] + r.w[i] = 0 + carry, r.w[i] = mulAdd(ak, Chunk(c), carry, r.w[i]) + //if c<0 {fmt.Printf("Negative c in pmul\n")} + //if ak<0 {fmt.Printf("Negative c in pmul\n")} + } + return carry +} + +/* convert this BIG to byte array */ +func (r *BIG) tobytearray(b []byte, n int) { + //r.norm(); + c := NewBIGcopy(r, nil) + c.norm() + + for i := int(MODBYTES) - 1; i >= 0; i-- { + b[i+n] = 
byte(c.w[0]) + c.fshr(8) + } +} + +/* convert from byte array to BIG */ +func frombytearray(b []byte, n int) *BIG { + m := NewBIG(nil) + l := len(b) + for i := 0; i < int(MODBYTES); i++ { + m.fshl(8) + if i < l { + m.w[0] += Chunk(int(b[i+n] & 0xff)) + } else { + m.w[0] += Chunk(int(0 & 0xff)) + } + } + return m +} + +func (r *BIG) ToBytes(b []byte) { + r.tobytearray(b, 0) +} + +func FromBytes(b []byte) *BIG { + return frombytearray(b, 0) +} + +/* divide by 3 */ +func (r *BIG) div3() int { + carry := Chunk(0) + r.norm() + base := (Chunk(1) << BASEBITS) + for i := NLEN - 1; i >= 0; i-- { + ak := (carry*base + r.w[i]) + r.w[i] = ak / 3 + carry = ak % 3 + } + return int(carry) +} + +/* return a*b where result fits in a BIG */ +func smul(a *BIG, b *BIG) *BIG { + carry := Chunk(0) + c := NewBIG(nil) + for i := 0; i < NLEN; i++ { + carry = 0 + for j := 0; j < NLEN; j++ { + if i+j < NLEN { + carry, c.w[i+j] = mulAdd(a.w[i], b.w[j], carry, c.w[i+j]) + } + } + } + return c +} + +/* Compare a and b, return 0 if a==b, -1 if a<b, +1 if a>b. 
Inputs must be normalised */ +func Comp(a *BIG, b *BIG) int { + gt := Chunk(0) + eq := Chunk(1) + for i := NLEN - 1; i >= 0; i-- { + gt |= ((b.w[i] - a.w[i]) >> BASEBITS) & eq + eq &= ((b.w[i] ^ a.w[i]) - 1) >> BASEBITS + } + return int(gt + gt + eq - 1) +} + +/* return parity */ +func (r *BIG) parity() int { + return int(r.w[0] % 2) +} + +/* return n-th bit */ +func (r *BIG) bit(n int) int { + return int((r.w[n/int(BASEBITS)] & (Chunk(1) << (uint(n) % BASEBITS))) >> (uint(n) % BASEBITS)) + // if (r.w[n/int(BASEBITS)] & (Chunk(1) << (uint(n) % BASEBITS))) > 0 { + // return 1 + // } + // return 0 +} + +/* return n last bits */ +func (r *BIG) lastbits(n int) int { + msk := (1 << uint(n)) - 1 + r.norm() + return (int(r.w[0])) & msk +} + +/* set x = x mod 2^m */ +func (r *BIG) mod2m(m uint) { + wd := int(m / BASEBITS) + bt := m % BASEBITS + msk := (Chunk(1) << bt) - 1 + r.w[wd] &= msk + for i := wd + 1; i < NLEN; i++ { + r.w[i] = 0 + } +} + +/* a=1/a mod 2^256. This is very fast! */ +func (r *BIG) invmod2m() { + U := NewBIG(nil) + b := NewBIG(nil) + c := NewBIG(nil) + + U.inc(invmod256(r.lastbits(8))) + + for i := 8; i < BIGBITS; i <<= 1 { + U.norm() + ui := uint(i) + b.copy(r) + b.mod2m(ui) + t1 := smul(U, b) + t1.shr(ui) + c.copy(r) + c.shr(ui) + c.mod2m(ui) + + t2 := smul(U, c) + t2.mod2m(ui) + t1.Add(t2) + t1.norm() + b = smul(t1, U) + t1.copy(b) + t1.mod2m(ui) + + t2.one() + t2.shl(ui) + t1.rsub(t2) + t1.norm() + t1.shl(ui) + U.Add(t1) + } + U.mod2m(8 * MODBYTES) + r.copy(U) + r.norm() +} + +func (r *BIG) ctmod(m *BIG, bd uint, mem *arena.Arena) { + k := bd + sr := NewBIG(mem) + c := NewBIGcopy(m, mem) + r.norm() + + c.shl(k) + + for { + sr.copy(r) + sr.Sub(c) + sr.norm() + r.cmove(sr, int(1-((sr.w[NLEN-1]>>uint(CHUNK-1))&1))) + if k == 0 { + break + } + c.fshr(1) + k -= 1 + } +} + +/* reduce this mod m */ +func (r *BIG) Mod(m *BIG, mem *arena.Arena) { + k := r.nbits() - m.nbits() + if k < 0 { + k = 0 + } + r.ctmod(m, uint(k), mem) +} + +func (r *BIG) ctdiv(m 
*BIG, bd uint, mem *arena.Arena) { + k := bd + e := NewBIGint(1, mem) + sr := NewBIG(mem) + a := NewBIGcopy(r, mem) + c := NewBIGcopy(m, mem) + r.norm() + r.zero() + + c.shl(k) + e.shl(k) + + for { + sr.copy(a) + sr.Sub(c) + sr.norm() + d := int(1 - ((sr.w[NLEN-1] >> uint(CHUNK-1)) & 1)) + a.cmove(sr, d) + sr.copy(r) + sr.Add(e) + sr.norm() + r.cmove(sr, d) + if k == 0 { + break + } + c.fshr(1) + e.fshr(1) + k -= 1 + } +} + +/* divide this by m */ +func (r *BIG) div(m *BIG, mem *arena.Arena) { + k := r.nbits() - m.nbits() + if k < 0 { + k = 0 + } + r.ctdiv(m, uint(k), mem) +} + +/* get 8*MODBYTES size random number */ +func Random(rng *ext.RAND) *BIG { + m := NewBIG(nil) + var j int = 0 + var r byte = 0 + /* generate random BIG */ + for i := 0; i < 8*int(MODBYTES); i++ { + if j == 0 { + r = rng.GetByte() + } else { + r >>= 1 + } + + b := Chunk(int(r & 1)) + m.shl(1) + m.w[0] += b + j++ + j &= 7 + } + return m +} + +/* Create random BIG in portable way, one bit at a time */ +func Randomnum(q *BIG, rng *ext.RAND) *BIG { + d := NewDBIG(nil) + var j int = 0 + var r byte = 0 + for i := 0; i < 2*q.nbits(); i++ { + if j == 0 { + r = rng.GetByte() + } else { + r >>= 1 + } + + b := Chunk(int(r & 1)) + d.shl(1) + d.w[0] += b + j++ + j &= 7 + } + m := d.Mod(q, nil) + return m +} + +func Randtrunc(q *BIG, trunc int, rng *ext.RAND) *BIG { + m := Randomnum(q, rng) + if q.nbits() > trunc { + m.mod2m(uint(trunc)) + } + return m +} + +/* return a*b mod m */ +func Modmul(a1, b1, m *BIG, mem *arena.Arena) *BIG { + a := NewBIGcopy(a1, mem) + b := NewBIGcopy(b1, mem) + a.Mod(m, mem) + b.Mod(m, mem) + d := mul(a, b, mem) + return d.ctmod(m, uint(m.nbits()), mem) +} + +/* return a^2 mod m */ +func Modsqr(a1, m *BIG, mem *arena.Arena) *BIG { + a := NewBIGcopy(a1, mem) + a.Mod(m, mem) + d := sqr(a, mem) + return d.ctmod(m, uint(m.nbits()), mem) +} + +/* return -a mod m */ +func Modneg(a1, m *BIG, mem *arena.Arena) *BIG { + a := NewBIGcopy(a1, mem) + a.Mod(m, mem) + a.rsub(m) + a.norm() + 
return a +} + +/* return a+b mod m */ +func ModAdd(a1, b1, m *BIG, mem *arena.Arena) *BIG { + a := NewBIGcopy(a1, mem) + b := NewBIGcopy(b1, mem) + a.Mod(m, mem) + b.Mod(m, mem) + a.Add(b) + a.norm() + a.ctmod(m, 1, mem) + return a +} + +/* Jacobi Symbol (this/p). Returns 0, 1 or -1 */ +func (r *BIG) Jacobi(p *BIG) int { + mem := arena.NewArena() + defer mem.Free() + m := 0 + t := NewBIGint(0, mem) + x := NewBIGint(0, mem) + n := NewBIGint(0, mem) + zilch := NewBIGint(0, mem) + one := NewBIGint(1, mem) + if p.parity() == 0 || Comp(r, zilch) == 0 || Comp(p, one) <= 0 { + return 0 + } + r.norm() + x.copy(r) + n.copy(p) + x.Mod(p, mem) + + for Comp(n, one) > 0 { + if Comp(x, zilch) == 0 { + return 0 + } + n8 := n.lastbits(3) + k := 0 + for x.parity() == 0 { + k++ + x.shr(1) + } + if k%2 == 1 { + m += (n8*n8 - 1) / 8 + } + m += (n8 - 1) * (x.lastbits(2) - 1) / 4 + t.copy(n) + t.Mod(x, mem) + n.copy(x) + x.copy(t) + m %= 2 + + } + if m == 0 { + return 1 + } + return -1 +} + +/* this=1/this mod p. 
Binary method */ +func (r *BIG) Invmodp(p *BIG) { + mem := arena.NewArena() + defer mem.Free() + r.Mod(p, mem) + if r.IsZero() { + return + } + u := NewBIGcopy(r, mem) + v := NewBIGcopy(p, mem) + x1 := NewBIGint(1, mem) + x2 := NewBIGint(0, mem) + t := NewBIGint(0, mem) + one := NewBIGint(1, mem) + for Comp(u, one) != 0 && Comp(v, one) != 0 { + for u.parity() == 0 { + u.fshr(1) + t.copy(x1) + t.Add(p) + x1.cmove(t, x1.parity()) + x1.norm() + x1.fshr(1) + } + for v.parity() == 0 { + v.fshr(1) + t.copy(x2) + t.Add(p) + x2.cmove(t, x2.parity()) + x2.norm() + x2.fshr(1) + } + if Comp(u, v) >= 0 { + u.Sub(v) + u.norm() + t.copy(x1) + t.Add(p) + x1.cmove(t, (Comp(x1, x2)>>1)&1) + x1.Sub(x2) + x1.norm() + } else { + v.Sub(u) + v.norm() + t.copy(x2) + t.Add(p) + x2.cmove(t, (Comp(x2, x1)>>1)&1) + x2.Sub(x1) + x2.norm() + } + } + r.copy(x1) + r.cmove(x2, Comp(u, one)&1) +} + +/* return this^e mod m */ +func (r *BIG) Powmod(e1 *BIG, m *BIG, mem *arena.Arena) *BIG { + e := NewBIGcopy(e1, mem) + r.norm() + e.norm() + a := NewBIGint(1, mem) + z := NewBIGcopy(e, mem) + s := NewBIGcopy(r, mem) + for true { + bt := z.parity() + z.fshr(1) + if bt == 1 { + a = Modmul(a, s, m, mem) + } + if z.IsZero() { + break + } + s = Modsqr(s, m, mem) + } + return a +} + +/* Arazi and Qi inversion mod 256 */ +func invmod256(a int) int { + var t1 int = 0 + c := (a >> 1) & 1 + t1 += c + t1 &= 1 + t1 = 2 - t1 + t1 <<= 1 + U := t1 + 1 + + // i=2 + b := a & 3 + t1 = U * b + t1 >>= 2 + c = (a >> 2) & 3 + t2 := (U * c) & 3 + t1 += t2 + t1 *= U + t1 &= 3 + t1 = 4 - t1 + t1 <<= 2 + U += t1 + + // i=4 + b = a & 15 + t1 = U * b + t1 >>= 4 + c = (a >> 4) & 15 + t2 = (U * c) & 15 + t1 += t2 + t1 *= U + t1 &= 15 + t1 = 16 - t1 + t1 <<= 4 + U += t1 + + return U +} + +func logb2(w uint32) uint { + v := w + v |= (v >> 1) + v |= (v >> 2) + v |= (v >> 4) + v |= (v >> 8) + v |= (v >> 16) + + v = v - ((v >> 1) & 0x55555555) + v = (v & 0x33333333) + ((v >> 2) & 0x33333333) + r := uint((((v + (v >> 4)) & 0xF0F0F0F) * 
0x1010101) >> 24) + return (r) +} + +// Optimized combined shift, subtract and norm +func ssn(r *BIG, a *BIG, m *BIG) int { + n := NLEN - 1 + m.w[0] = (m.w[0] >> 1) | ((m.w[1] << (BASEBITS - 1)) & BMASK) + r.w[0] = a.w[0] - m.w[0] + carry := r.w[0] >> BASEBITS + r.w[0] &= BMASK + for i := 1; i < n; i++ { + m.w[i] = (m.w[i] >> 1) | ((m.w[i+1] << (BASEBITS - 1)) & BMASK) + r.w[i] = a.w[i] - m.w[i] + carry + carry = r.w[i] >> BASEBITS + r.w[i] &= BMASK + } + m.w[n] >>= 1 + r.w[n] = a.w[n] - m.w[n] + carry + return int((r.w[n] >> uint(CHUNK-1)) & 1) +} diff --git a/nekryptology/pkg/core/curves/native/bls48581/bls256.go b/nekryptology/pkg/core/curves/native/bls48581/bls256.go index c397a91..8d30ea1 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/bls256.go +++ b/nekryptology/pkg/core/curves/native/bls48581/bls256.go @@ -42,7 +42,7 @@ func ceil(a int, b int) int { /* output u \in F_p */ func Hash_to_field(hash int, hlen int, DST []byte, M []byte, ctr int) []*FP { - q := NewBIGints(Modulus) + q := NewBIGints(Modulus, nil) nbq := q.nbits() L := ceil(nbq+AESKEY*8, 8) var u []*FP @@ -53,7 +53,7 @@ func Hash_to_field(hash int, hlen int, DST []byte, M []byte, ctr int) []*FP { for j := 0; j < L; j++ { fd[j] = OKM[i*L+j] } - u = append(u, NewFPbig(DBIG_fromBytes(fd).ctmod(q, uint(8*L-nbq)))) + u = append(u, NewFPbig(DBIG_fromBytes(fd).ctmod(q, uint(8*L-nbq), nil), nil)) } return u } @@ -65,15 +65,15 @@ func Bls256_hash_to_point(M []byte) *ECP { P := ECP_map2point(u[0]) P1 := ECP_map2point(u[1]) - P.Add(P1) + P.Add(P1, nil) P.Cfp() - P.Affine() + P.Affine(nil) return P } func Init() int { G := ECP8_generator() - if G.Is_infinity() { + if G.Is_infinity(nil) { return BLS_FAIL } G2_TAB = precomp(G) @@ -82,7 +82,7 @@ func Init() int { /* generate key pair, private key S, public key W */ func KeyPairGenerate(IKM []byte, S []byte, W []byte) int { - r := NewBIGints(CURVE_Order) + r := NewBIGints(CURVE_Order, nil) nbr := r.nbits() L := ceil(3*ceil(nbr, 8), 2) LEN := 
ext.InttoBytes(L, 2) @@ -93,7 +93,7 @@ func KeyPairGenerate(IKM []byte, S []byte, W []byte) int { AIKM[len(IKM)] = 0 G := ECP8_generator() - if G.Is_infinity() { + if G.Is_infinity(nil) { return BLS_FAIL } SALT := []byte("BLS-SIG-KEYGEN-SALT-") @@ -101,10 +101,10 @@ func KeyPairGenerate(IKM []byte, S []byte, W []byte) int { OKM := ext.HKDF_Expand(ext.MC_SHA2, HASH_TYPE, L, PRK, LEN) dx := DBIG_fromBytes(OKM[:]) - s := dx.ctmod(r, uint(8*L-nbr)) + s := dx.ctmod(r, uint(8*L-nbr), nil) s.ToBytes(S) // SkToPk - G = G2mul(G, s) + G = G2mul(G, s, nil) G.ToBytes(W, true) return BLS_OK } @@ -113,7 +113,7 @@ func KeyPairGenerate(IKM []byte, S []byte, W []byte) int { func Core_Sign(SIG []byte, M []byte, S []byte) int { D := Bls256_hash_to_point(M) s := FromBytes(S) - D = G1mul(D, s) + D = G1mul(D, s, nil) D.ToBytes(SIG, true) return BLS_OK } @@ -124,21 +124,21 @@ func Core_Verify(SIG []byte, M []byte, W []byte) int { HM := Bls256_hash_to_point(M) D := ECP_fromBytes(SIG) - if !G1member(D) { + if !G1member(D, nil) { return BLS_FAIL } - D.Neg() + D.Neg(nil) PK := ECP8_fromBytes(W) - if !G2member(PK) { + if !G2member(PK, nil) { return BLS_FAIL } // Use new multi-pairing mechanism - r := Initmp() + r := Initmp(nil) Another_pc(r, G2_TAB, D) - Another(r, PK, HM) - v := Miller(r) + Another(r, PK, HM, nil) + v := Miller(r, nil) //.. or alternatively // G := ECP8_generator() diff --git a/nekryptology/pkg/core/curves/native/bls48581/config_big.go b/nekryptology/pkg/core/curves/native/bls48581/config_big_32.go similarity index 98% rename from nekryptology/pkg/core/curves/native/bls48581/config_big.go rename to nekryptology/pkg/core/curves/native/bls48581/config_big_32.go index 699a6e2..b814453 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/config_big.go +++ b/nekryptology/pkg/core/curves/native/bls48581/config_big_32.go @@ -1,3 +1,5 @@ +//go:build js && wasm + /* * Copyright (c) 2012-2020 MIRACL UK Ltd. 
* diff --git a/nekryptology/pkg/core/curves/native/bls48581/config_big_64.go b/nekryptology/pkg/core/curves/native/bls48581/config_big_64.go new file mode 100644 index 0000000..d31bd4d --- /dev/null +++ b/nekryptology/pkg/core/curves/native/bls48581/config_big_64.go @@ -0,0 +1,36 @@ +//go:build !js && !wasm + +/* + * Copyright (c) 2012-2020 MIRACL UK Ltd. + * + * This file is part of MIRACL Core + * (see https://github.com/miracl/core). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package bls48581 + +// BIG length in bytes and number base +const MODBYTES uint = 73 +const BASEBITS uint = 60 + +// BIG lengths and Masks +const NLEN int = int((1 + ((8*MODBYTES - 1) / BASEBITS))) +const DNLEN int = 2 * NLEN +const BMASK Chunk = ((Chunk(1) << BASEBITS) - 1) +const HBITS uint = (BASEBITS / 2) +const HMASK Chunk = ((Chunk(1) << HBITS) - 1) +const NEXCESS int = (1 << (uint(CHUNK) - BASEBITS - 1)) + +const BIGBITS int = int(MODBYTES * 8) diff --git a/nekryptology/pkg/core/curves/native/bls48581/config_curve.go b/nekryptology/pkg/core/curves/native/bls48581/config_curve.go index dc23507..7a39de1 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/config_curve.go +++ b/nekryptology/pkg/core/curves/native/bls48581/config_curve.go @@ -19,11 +19,6 @@ package bls48581 -// Curve types -const WEIERSTRASS int = 0 -const EDWARDS int = 1 -const MONTGOMERY int = 2 - // Pairing Friendly? 
const NOT int = 0 const BN int = 1 @@ -31,10 +26,6 @@ const BLS12 int = 2 const BLS24 int = 3 const BLS48 int = 4 -// Pairing Twist type -const D_TYPE int = 0 -const M_TYPE int = 1 - // Sparsity const FP_ZERO int = 0 const FP_ONE int = 1 @@ -43,34 +34,16 @@ const FP_SPARSER int = 3 const FP_SPARSE int = 4 const FP_DENSE int = 5 -// Pairing x parameter sign -const POSITIVEX int = 0 -const NEGATIVEX int = 1 - -// Curve type - -const CURVETYPE int = WEIERSTRASS const CURVE_A int = 0 -const CURVE_PAIRING_TYPE int = BLS48 -// Pairings only - -const SEXTIC_TWIST int = D_TYPE -const SIGN_OF_X int = NEGATIVEX const ATE_BITS int = 33 const G2_TABLE int = 36 const HTC_ISO int = 0 const HTC_ISO_G2 int = 0 -// associated hash function and AES key size - const HASH_TYPE int = 64 const AESKEY int = 32 -const ALLOW_ALT_COMPRESS bool = false - -// These are manually decided policy decisions. To block any potential patent issues set to false. - const USE_GLV bool = true const USE_GS_G2 bool = true const USE_GS_GT bool = true diff --git a/nekryptology/pkg/core/curves/native/bls48581/config_field.go b/nekryptology/pkg/core/curves/native/bls48581/config_field_32.go similarity index 98% rename from nekryptology/pkg/core/curves/native/bls48581/config_field.go rename to nekryptology/pkg/core/curves/native/bls48581/config_field_32.go index 1d47ce8..81d262c 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/config_field.go +++ b/nekryptology/pkg/core/curves/native/bls48581/config_field_32.go @@ -1,3 +1,5 @@ +//go:build js && wasm + /* * Copyright (c) 2012-2020 MIRACL UK Ltd. * diff --git a/nekryptology/pkg/core/curves/native/bls48581/config_field_64.go b/nekryptology/pkg/core/curves/native/bls48581/config_field_64.go new file mode 100644 index 0000000..186ddd5 --- /dev/null +++ b/nekryptology/pkg/core/curves/native/bls48581/config_field_64.go @@ -0,0 +1,49 @@ +//go:build !js && !wasm + +/* + * Copyright (c) 2012-2020 MIRACL UK Ltd. 
+ * + * This file is part of MIRACL Core + * (see https://github.com/miracl/core). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package bls48581 + +// Modulus types +const NOT_SPECIAL int = 0 +const PSEUDO_MERSENNE int = 1 +const MONTGOMERY_FRIENDLY int = 2 +const GENERALISED_MERSENNE int = 3 + +const NEGATOWER int = 0 +const POSITOWER int = 1 + +// Modulus details +const MODBITS uint = 581 /* Number of bits in Modulus */ +const PM1D2 uint = 1 /* Modulus mod 8 */ +const RIADZ int = 2 /* hash-to-point Z */ +const RIADZG2A int = 2 /* G2 hash-to-point Z */ +const RIADZG2B int = 0 /* G2 hash-to-point Z */ +const MODTYPE int = NOT_SPECIAL //NOT_SPECIAL +const QNRI int = 0 // Fp2 QNR +const TOWER int = POSITOWER // Tower type +const FEXCESS int32 = ((int32(1) << 19) - 1) + +// Modulus Masks +const OMASK Chunk = ((Chunk(-1)) << (MODBITS % BASEBITS)) +const TBITS uint = MODBITS % BASEBITS // Number of active bits in top word +const TMASK Chunk = (Chunk(1) << TBITS) - 1 + +const BIG_ENDIAN_SIGN bool = false diff --git a/nekryptology/pkg/core/curves/native/bls48581/dbig.go b/nekryptology/pkg/core/curves/native/bls48581/dbig.go index 2e9a2ae..755cdd2 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/dbig.go +++ b/nekryptology/pkg/core/curves/native/bls48581/dbig.go @@ -21,28 +21,46 @@ package bls48581 -import "strconv" +import ( + "arena" + "strconv" +) //import "fmt" -func NewDBIG() *DBIG { - b := new(DBIG) +func NewDBIG(mem *arena.Arena) *DBIG { + 
var b *DBIG + if mem != nil { + b = arena.New[DBIG](mem) + } else { + b = new(DBIG) + } for i := 0; i < DNLEN; i++ { b.w[i] = 0 } return b } -func NewDBIGcopy(x *DBIG) *DBIG { - b := new(DBIG) +func NewDBIGcopy(x *DBIG, mem *arena.Arena) *DBIG { + var b *DBIG + if mem != nil { + b = arena.New[DBIG](mem) + } else { + b = new(DBIG) + } for i := 0; i < DNLEN; i++ { b.w[i] = x.w[i] } return b } -func NewDBIGscopy(x *BIG) *DBIG { - b := new(DBIG) +func NewDBIGscopy(x *BIG, mem *arena.Arena) *DBIG { + var b *DBIG + if mem != nil { + b = arena.New[DBIG](mem) + } else { + b = new(DBIG) + } for i := 0; i < NLEN-1; i++ { b.w[i] = x.w[i] } @@ -67,8 +85,8 @@ func (r *DBIG) norm() { } /* split DBIG at position n, return higher half, keep lower half */ -func (r *DBIG) split(n uint) *BIG { - t := NewBIG() +func (r *DBIG) split(n uint, mem *arena.Arena) *BIG { + t := NewBIG(mem) m := n % BASEBITS carry := r.w[DNLEN-1] << (BASEBITS - m) @@ -173,11 +191,11 @@ func (r *DBIG) shr(k uint) { } } -func (r *DBIG) ctmod(m *BIG, bd uint) *BIG { +func (r *DBIG) ctmod(m *BIG, bd uint, mem *arena.Arena) *BIG { k := bd r.norm() - c := NewDBIGscopy(m) - dr := NewDBIG() + c := NewDBIGscopy(m, mem) + dr := NewDBIG(mem) c.shl(k) @@ -192,25 +210,25 @@ func (r *DBIG) ctmod(m *BIG, bd uint) *BIG { k -= 1 c.shr(1) } - return NewBIGdcopy(r) + return NewBIGdcopy(r, mem) } /* reduces this DBIG mod a BIG, and returns the BIG */ -func (r *DBIG) Mod(m *BIG) *BIG { +func (r *DBIG) Mod(m *BIG, mem *arena.Arena) *BIG { k := r.nbits() - m.nbits() if k < 0 { k = 0 } - return r.ctmod(m, uint(k)) + return r.ctmod(m, uint(k), mem) } -func (r *DBIG) ctdiv(m *BIG, bd uint) *BIG { +func (r *DBIG) ctdiv(m *BIG, bd uint, mem *arena.Arena) *BIG { k := bd - c := NewDBIGscopy(m) - a := NewBIGint(0) - e := NewBIGint(1) - sr := NewBIG() - dr := NewDBIG() + c := NewDBIGscopy(m, mem) + a := NewBIGint(0, mem) + e := NewBIGint(1, mem) + sr := NewBIG(mem) + dr := NewDBIG(mem) r.norm() c.shl(k) @@ -237,12 +255,12 @@ func (r *DBIG) 
ctdiv(m *BIG, bd uint) *BIG { } /* return this/c */ -func (r *DBIG) div(m *BIG) *BIG { +func (r *DBIG) div(m *BIG, mem *arena.Arena) *BIG { k := r.nbits() - m.nbits() if k < 0 { k = 0 } - return r.ctdiv(m, uint(k)) + return r.ctdiv(m, uint(k), mem) } /* Convert to Hex String */ @@ -259,7 +277,7 @@ func (r *DBIG) toString() string { } for i := len - 1; i >= 0; i-- { - b := NewDBIGcopy(r) + b := NewDBIGcopy(r, nil) b.shr(uint(i * 4)) s += strconv.FormatInt(int64(b.w[0]&15), 16) @@ -270,7 +288,7 @@ func (r *DBIG) toString() string { /* return number of bits */ func (r *DBIG) nbits() int { k := DNLEN - 1 - t := NewDBIGcopy(r) + t := NewDBIGcopy(r, nil) t.norm() for k >= 0 && t.w[k] == 0 { k-- @@ -289,7 +307,7 @@ func (r *DBIG) nbits() int { /* convert from byte array to BIG */ func DBIG_fromBytes(b []byte) *DBIG { - m := NewDBIG() + m := NewDBIG(nil) for i := 0; i < len(b); i++ { m.shl(8) m.w[0] += Chunk(int(b[i] & 0xff)) diff --git a/nekryptology/pkg/core/curves/native/bls48581/ecdh.go b/nekryptology/pkg/core/curves/native/bls48581/ecdh.go deleted file mode 100644 index 0480043..0000000 --- a/nekryptology/pkg/core/curves/native/bls48581/ecdh.go +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2012-2020 MIRACL UK Ltd. - * - * This file is part of MIRACL Core - * (see https://github.com/miracl/ext.. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ECDH/ECIES/ECDSA API Functions */ - -package bls48581 - -//import "fmt" -import "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" - -const INVALID_PUBLIC_KEY int = -2 -const ERROR int = -3 - -//const INVALID int = -4 -const EFS int = int(MODBYTES) -const EGS int = int(MODBYTES) - -// Transform a point multiplier to RFC7748 form -func RFC7748(r *BIG) { - lg := 0 - t := NewBIGint(1) - c := CURVE_Cof_I - for c != 1 { - lg++ - c /= 2 - } - n := uint(8*EGS - lg + 1) - r.mod2m(n) - t.shl(n) - r.Add(t) - c = r.lastbits(lg) - r.dec(c) -} - -/* return true if S is in ranger 0 < S < order , else return false */ -func ECDH_IN_RANGE(S []byte) bool { - r := NewBIGints(CURVE_Order) - s := FromBytes(S) - if s.IsZero() { - return false - } - if Comp(s, r) >= 0 { - return false - } - return true -} - -/* Calculate a public/private EC GF(p) key pair W,S where W=S.G mod EC(p), - * where S is the secret key and W is the public key - * and G is fixed generator. 
- * If RNG is NULL then the private key is provided externally in S - * otherwise it is generated randomly internally */ -func ECDH_KEY_PAIR_GENERATE(RNG *ext.RAND, S []byte, W []byte) int { - res := 0 - var s *BIG - var G *ECP - - G = ECP_generator() - r := NewBIGints(CURVE_Order) - - if RNG == nil { - s = FromBytes(S) - } else { - if CURVETYPE != WEIERSTRASS { - s = Random(RNG) // from random bytes - } else { - s = Randomnum(r, RNG) // Removes biases - } - } - - if CURVETYPE != WEIERSTRASS { - RFC7748(s) // For Montgomery or Edwards, apply RFC7748 transformation - } - - s.ToBytes(S) - WP := G.clmul(s, r) - WP.ToBytes(W, false) // To use point compression on public keys, change to true - - return res -} - -/* validate public key */ -func ECDH_PUBLIC_KEY_VALIDATE(W []byte) int { - WP := ECP_fromBytes(W) - res := 0 - - r := NewBIGints(CURVE_Order) - - if WP.Is_infinity() { - res = INVALID_PUBLIC_KEY - } - if res == 0 { - - q := NewBIGints(Modulus) - nb := q.nbits() - k := NewBIGint(1) - k.shl(uint((nb + 4) / 2)) - k.Add(q) - k.div(r) - - for k.parity() == 0 { - k.shr(1) - WP.Dbl() - } - - if !k.isunity() { - WP = WP.lmul(k) - } - if WP.Is_infinity() { - res = INVALID_PUBLIC_KEY - } - - } - return res -} - -/* IEEE-1363 Diffie-Hellman online calculation Z=S.WD */ -// type = 0 is just x coordinate output -// type = 1 for standard compressed output -// type = 2 for standard uncompress output 04|x|y -func ECDH_ECPSVDP_DH(S []byte, WD []byte, Z []byte, typ int) int { - res := 0 - - s := FromBytes(S) - - W := ECP_fromBytes(WD) - if W.Is_infinity() { - res = ERROR - } - - if res == 0 { - r := NewBIGints(CURVE_Order) - W = W.clmul(s, r) - if W.Is_infinity() { - res = ERROR - } else { - if CURVETYPE != MONTGOMERY { - if typ > 0 { - if typ == 1 { - W.ToBytes(Z, true) - } else { - W.ToBytes(Z, false) - } - } else { - W.GetX().ToBytes(Z) - } - return res - } else { - W.GetX().ToBytes(Z) - } - } - } - return res -} - -/* IEEE ECDSA Signature, C and D are signature on F using 
private key S */ -func ECDH_ECPSP_DSA(sha int, RNG *ext.RAND, S []byte, F []byte, C []byte, D []byte) int { - var T [EGS]byte - - B := ext.GPhashit(ext.MC_SHA2, sha, EGS, 0, F, -1, nil) - G := ECP_generator() - - r := NewBIGints(CURVE_Order) - s := FromBytes(S) - f := FromBytes(B[:]) - - c := NewBIGint(0) - d := NewBIGint(0) - V := NewECP() - - for d.IsZero() { - u := Randomnum(r, RNG) - w := Randomnum(r, RNG) /* IMPORTANT - side channel masking to protect invmodp() */ - - V.Copy(G) - V = V.clmul(u, r) - vx := V.GetX() - c.copy(vx) - c.Mod(r) - if c.IsZero() { - continue - } - u.copy(Modmul(u, w, r)) - u.Invmodp(r) - d.copy(Modmul(s, c, r)) - d.copy(ModAdd(d, f, r)) - d.copy(Modmul(d, w, r)) - d.copy(Modmul(u, d, r)) - } - - c.ToBytes(T[:]) - for i := 0; i < EGS; i++ { - C[i] = T[i] - } - d.ToBytes(T[:]) - for i := 0; i < EGS; i++ { - D[i] = T[i] - } - return 0 -} - -/* IEEE1363 ECDSA Signature Verification. Signature C and D on F is verified using public key W */ -func ECDH_ECPVP_DSA(sha int, W []byte, F []byte, C []byte, D []byte) int { - res := 0 - - B := ext.GPhashit(ext.MC_SHA2, sha, EGS, 0, F, -1, nil) - - G := ECP_generator() - r := NewBIGints(CURVE_Order) - - c := FromBytes(C) - d := FromBytes(D) - f := FromBytes(B[:]) - - if c.IsZero() || Comp(c, r) >= 0 || d.IsZero() || Comp(d, r) >= 0 { - res = ERROR - } - - if res == 0 { - d.Invmodp(r) - f.copy(Modmul(f, d, r)) - h2 := Modmul(c, d, r) - - WP := ECP_fromBytes(W) - if WP.Is_infinity() { - res = ERROR - } else { - P := NewECP() - P.Copy(WP) - - P = P.Mul2(h2, G, f) - - if P.Is_infinity() { - res = ERROR - } else { - d = P.GetX() - d.Mod(r) - - if Comp(d, c) != 0 { - res = ERROR - } - } - } - } - - return res -} - -/* IEEE1363 ECIES encryption. 
Encryption of plaintext M uses public key W and produces ciphertext V,C,T */ -func ECDH_ECIES_ENCRYPT(sha int, P1 []byte, P2 []byte, RNG *ext.RAND, W []byte, M []byte, V []byte, T []byte) []byte { - var Z [EFS]byte - var VZ [3*EFS + 1]byte - var K1 [AESKEY]byte - var K2 [AESKEY]byte - var U [EGS]byte - - if ECDH_KEY_PAIR_GENERATE(RNG, U[:], V) != 0 { - return nil - } - if ECDH_ECPSVDP_DH(U[:], W, Z[:], 0) != 0 { - return nil - } - - for i := 0; i < 2*EFS+1; i++ { - VZ[i] = V[i] - } - for i := 0; i < EFS; i++ { - VZ[2*EFS+1+i] = Z[i] - } - - K := ext.KDF2(ext.MC_SHA2, sha, VZ[:], P1, 2*AESKEY) - - for i := 0; i < AESKEY; i++ { - K1[i] = K[i] - K2[i] = K[AESKEY+i] - } - - C := ext.AES_CBC_IV0_ENCRYPT(K1[:], M) - - L2 := ext.InttoBytes(len(P2), 8) - - var AC []byte - - for i := 0; i < len(C); i++ { - AC = append(AC, C[i]) - } - for i := 0; i < len(P2); i++ { - AC = append(AC, P2[i]) - } - for i := 0; i < 8; i++ { - AC = append(AC, L2[i]) - } - - ext.HMAC(ext.MC_SHA2, sha, T, len(T), K2[:], AC) - - return C -} - -/* constant time n-byte compare */ -func ncomp(T1 []byte, T2 []byte, n int) bool { - res := 0 - for i := 0; i < n; i++ { - res |= int(T1[i] ^ T2[i]) - } - if res == 0 { - return true - } - return false -} - -/* IEEE1363 ECIES decryption. 
Decryption of ciphertext V,C,T using private key U outputs plaintext M */ -func ECDH_ECIES_DECRYPT(sha int, P1 []byte, P2 []byte, V []byte, C []byte, T []byte, U []byte) []byte { - var Z [EFS]byte - var VZ [3*EFS + 1]byte - var K1 [AESKEY]byte - var K2 [AESKEY]byte - - var TAG []byte = T[:] - - if ECDH_ECPSVDP_DH(U, V, Z[:], 0) != 0 { - return nil - } - - for i := 0; i < 2*EFS+1; i++ { - VZ[i] = V[i] - } - for i := 0; i < EFS; i++ { - VZ[2*EFS+1+i] = Z[i] - } - - K := ext.KDF2(ext.MC_SHA2, sha, VZ[:], P1, 2*AESKEY) - - for i := 0; i < AESKEY; i++ { - K1[i] = K[i] - K2[i] = K[AESKEY+i] - } - - M := ext.AES_CBC_IV0_DECRYPT(K1[:], C) - - if M == nil { - return nil - } - - L2 := ext.InttoBytes(len(P2), 8) - - var AC []byte - - for i := 0; i < len(C); i++ { - AC = append(AC, C[i]) - } - for i := 0; i < len(P2); i++ { - AC = append(AC, P2[i]) - } - for i := 0; i < 8; i++ { - AC = append(AC, L2[i]) - } - - ext.HMAC(ext.MC_SHA2, sha, TAG, len(TAG), K2[:], AC) - - if !ncomp(T, TAG, len(T)) { - return nil - } - - return M -} diff --git a/nekryptology/pkg/core/curves/native/bls48581/fp.go b/nekryptology/pkg/core/curves/native/bls48581/fp.go index c42feca..2cab741 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/fp.go +++ b/nekryptology/pkg/core/curves/native/bls48581/fp.go @@ -22,7 +22,11 @@ package bls48581 -import "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" +import ( + "arena" + + "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" +) type FP struct { x *BIG @@ -31,84 +35,119 @@ type FP struct { /* Constructors */ -func NewFP() *FP { - F := new(FP) - F.x = NewBIG() - F.XES = 1 - return F -} - -func NewFPint(a int) *FP { - F := new(FP) - if a < 0 { - m := NewBIGints(Modulus) - m.inc(a) - m.norm() - F.x = NewBIGcopy(m) +func NewFP(mem *arena.Arena) *FP { + if mem != nil { + F := arena.New[FP](mem) + F.x = NewBIG(mem) + F.XES = 1 + return F } else { - F.x = NewBIGint(a) + F := 
new(FP) + F.x = NewBIG(nil) + F.XES = 1 + return F } - F.nres() - return F } -func NewFPbig(a *BIG) *FP { - F := new(FP) - F.x = NewBIGcopy(a) - F.nres() - return F +func NewFPint(a int, mem *arena.Arena) *FP { + if mem != nil { + F := arena.New[FP](mem) + if a < 0 { + m := NewBIGints(Modulus, mem) + m.inc(a) + m.norm() + F.x = NewBIGcopy(m, mem) + } else { + F.x = NewBIGint(a, mem) + } + F.nres(mem) + return F + } else { + F := new(FP) + if a < 0 { + m := NewBIGints(Modulus, nil) + m.inc(a) + m.norm() + F.x = NewBIGcopy(m, nil) + } else { + F.x = NewBIGint(a, nil) + } + F.nres(nil) + return F + } } -func NewFPcopy(a *FP) *FP { - F := new(FP) - F.x = NewBIGcopy(a.x) - F.XES = a.XES - return F +func NewFPbig(a *BIG, mem *arena.Arena) *FP { + if mem != nil { + F := arena.New[FP](mem) + F.x = NewBIGcopy(a, mem) + F.nres(mem) + return F + } else { + F := new(FP) + F.x = NewBIGcopy(a, nil) + F.nres(nil) + return F + } +} + +func NewFPcopy(a *FP, mem *arena.Arena) *FP { + if mem != nil { + F := arena.New[FP](mem) + F.x = NewBIGcopy(a.x, mem) + F.XES = a.XES + return F + } else { + F := new(FP) + F.x = NewBIGcopy(a.x, nil) + F.XES = a.XES + return F + } } func NewFPrand(rng *ext.RAND) *FP { - m := NewBIGints(Modulus) + m := NewBIGints(Modulus, nil) w := Randomnum(m, rng) - F := NewFPbig(w) + F := NewFPbig(w, nil) return F } func (F *FP) ToString() string { - F.reduce() - return F.Redc().ToString() + F.reduce(nil) + return F.Redc(nil).ToString() } /* convert to Montgomery n-residue form */ -func (F *FP) nres() { +func (F *FP) nres(mem *arena.Arena) { if MODTYPE != PSEUDO_MERSENNE && MODTYPE != GENERALISED_MERSENNE { - r := NewBIGints(R2modp) - d := mul(F.x, r) - F.x.copy(mod(d)) + r := NewBIGints(R2modp, mem) + d := mul(F.x, r, mem) + F.x.copy(mod(d, mem)) F.XES = 2 } else { - md := NewBIGints(Modulus) - F.x.Mod(md) + md := NewBIGints(Modulus, mem) + F.x.Mod(md, mem) F.XES = 1 } } /* convert back to regular form */ -func (F *FP) Redc() *BIG { +func (F *FP) Redc(mem 
*arena.Arena) *BIG { if MODTYPE != PSEUDO_MERSENNE && MODTYPE != GENERALISED_MERSENNE { - d := NewDBIGscopy(F.x) - return mod(d) + d := NewDBIGscopy(F.x, mem) + return mod(d, mem) } else { - r := NewBIGcopy(F.x) + r := NewBIGcopy(F.x, mem) return r } } /* reduce a DBIG to a BIG using the appropriate form of the modulus */ -func mod(d *DBIG) *BIG { +func mod(d *DBIG, mem *arena.Arena) *BIG { if MODTYPE == PSEUDO_MERSENNE { - t := d.split(MODBITS) - b := NewBIGdcopy(d) + t := d.split(MODBITS, mem) + b := NewBIGdcopy(d, mem) v := t.pmul(int(MConst)) @@ -128,7 +167,7 @@ func mod(d *DBIG) *BIG { d.w[NLEN+i-1] = bot d.w[NLEN+i] += top } - b := NewBIG() + b := NewBIG(mem) for i := 0; i < NLEN; i++ { b.w[i] = d.w[NLEN+i] @@ -138,14 +177,14 @@ func mod(d *DBIG) *BIG { } if MODTYPE == GENERALISED_MERSENNE { // GoldiLocks only - t := d.split(MODBITS) - b := NewBIGdcopy(d) + t := d.split(MODBITS, mem) + b := NewBIGdcopy(d, mem) b.Add(t) - dd := NewDBIGscopy(t) + dd := NewDBIGscopy(t, mem) dd.shl(MODBITS / 2) - tt := dd.split(MODBITS) - lo := NewBIGdcopy(dd) + tt := dd.split(MODBITS, mem) + lo := NewBIGdcopy(dd, mem) b.Add(tt) b.Add(lo) b.norm() @@ -163,10 +202,10 @@ func mod(d *DBIG) *BIG { } if MODTYPE == NOT_SPECIAL { - md := NewBIGints(Modulus) - return monty(md, MConst, d) + md := NewBIGints(Modulus, mem) + return monty(md, MConst, d, mem) } - return NewBIG() + return NewBIG(mem) } // find appoximation to quotient of a/m @@ -189,9 +228,9 @@ func quo(n *BIG, m *BIG) int { } /* reduce this mod Modulus */ -func (F *FP) reduce() { - m := NewBIGints(Modulus) - r := NewBIGints(Modulus) +func (F *FP) reduce(mem *arena.Arena) { + m := NewBIGints(Modulus, mem) + r := NewBIGints(Modulus, mem) var sb uint F.x.norm() @@ -217,43 +256,49 @@ func (F *FP) reduce() { } /* test this=0? 
*/ -func (F *FP) IsZero() bool { - W := NewFPcopy(F) - W.reduce() +func (F *FP) IsZero(mem *arena.Arena) bool { + W := NewFPcopy(F, mem) + W.reduce(mem) return W.x.IsZero() } func (F *FP) IsOne() bool { - W := NewFPcopy(F) - W.reduce() - T := NewFPint(1) + mem := arena.NewArena() + defer mem.Free() + W := NewFPcopy(F, mem) + W.reduce(mem) + T := NewFPint(1, mem) return W.Equals(T) } func (F *FP) islarger() int { - if F.IsZero() { + mem := arena.NewArena() + defer mem.Free() + if F.IsZero(mem) { return 0 } - sx := NewBIGints(Modulus) - fx := F.Redc() + sx := NewBIGints(Modulus, mem) + fx := F.Redc(mem) sx.Sub(fx) sx.norm() return Comp(fx, sx) } func (F *FP) ToBytes(b []byte) { - F.Redc().ToBytes(b) + F.Redc(nil).ToBytes(b) } func FP_fromBytes(b []byte) *FP { t := FromBytes(b) - return NewFPbig(t) + return NewFPbig(t, nil) } func (F *FP) isunity() bool { - W := NewFPcopy(F) - W.reduce() - return W.Redc().isunity() + mem := arena.NewArena() + defer mem.Free() + W := NewFPcopy(F, mem) + W.reduce(mem) + return W.Redc(mem).isunity() } /* copy from FP b */ @@ -270,25 +315,27 @@ func (F *FP) zero() { /* set this=1 */ func (F *FP) one() { + mem := arena.NewArena() + defer mem.Free() F.x.one() - F.nres() + F.nres(mem) } /* return sign */ -func (F *FP) sign() int { +func (F *FP) sign(mem *arena.Arena) int { if BIG_ENDIAN_SIGN { - m := NewBIGints(Modulus) + m := NewBIGints(Modulus, mem) m.dec(1) m.fshr(1) - n := NewFPcopy(F) - n.reduce() - w := n.Redc() + n := NewFPcopy(F, mem) + n.reduce(mem) + w := n.Redc(mem) cp := Comp(w, m) return ((cp + 1) & 2) >> 1 } else { - W := NewFPcopy(F) - W.reduce() - return W.Redc().parity() + W := NewFPcopy(F, mem) + W.reduce(mem) + return W.Redc(mem).parity() } } @@ -315,20 +362,20 @@ func (F *FP) cmove(b *FP, d int) { } /* this*=b mod Modulus */ -func (F *FP) Mul(b *FP) { +func (F *FP) Mul(b *FP, mem *arena.Arena) { if int64(F.XES)*int64(b.XES) > int64(FEXCESS) { - F.reduce() + F.reduce(mem) } - d := mul(F.x, b.x) - F.x.copy(mod(d)) + d := 
mul(F.x, b.x, mem) + F.x.copy(mod(d, mem)) F.XES = 2 } /* this = -this mod Modulus */ -func (F *FP) Neg() { - m := NewBIGints(Modulus) +func (F *FP) Neg(mem *arena.Arena) { + m := NewBIGints(Modulus, mem) sb := logb2(uint32(F.XES - 1)) m.fshl(sb) @@ -336,12 +383,12 @@ func (F *FP) Neg() { F.XES = (1 << sb) + 1 if F.XES > FEXCESS { - F.reduce() + F.reduce(mem) } } /* this*=c mod Modulus, where c is a small int */ -func (F *FP) imul(c int) { +func (F *FP) imul(c int, mem *arena.Arena) { // F.norm() s := false if c < 0 { @@ -350,60 +397,60 @@ func (F *FP) imul(c int) { } if MODTYPE == PSEUDO_MERSENNE || MODTYPE == GENERALISED_MERSENNE { - d := F.x.pxmul(c) - F.x.copy(mod(d)) + d := F.x.pxmul(c, mem) + F.x.copy(mod(d, mem)) F.XES = 2 } else { if F.XES*int32(c) <= FEXCESS { F.x.pmul(c) F.XES *= int32(c) } else { - n := NewFPint(c) - F.Mul(n) + n := NewFPint(c, mem) + F.Mul(n, mem) } } if s { - F.Neg() + F.Neg(mem) F.norm() } } /* this*=this mod Modulus */ -func (F *FP) Sqr() { +func (F *FP) Sqr(mem *arena.Arena) { if int64(F.XES)*int64(F.XES) > int64(FEXCESS) { - F.reduce() + F.reduce(mem) } - d := sqr(F.x) - F.x.copy(mod(d)) + d := sqr(F.x, mem) + F.x.copy(mod(d, mem)) F.XES = 2 } /* this+=b */ -func (F *FP) Add(b *FP) { +func (F *FP) Add(b *FP, mem *arena.Arena) { F.x.Add(b.x) F.XES += b.XES if F.XES > FEXCESS { - F.reduce() + F.reduce(mem) } } /* this-=b */ -func (F *FP) Sub(b *FP) { - n := NewFPcopy(b) - n.Neg() - F.Add(n) +func (F *FP) Sub(b *FP, mem *arena.Arena) { + n := NewFPcopy(b, mem) + n.Neg(mem) + F.Add(n, mem) } -func (F *FP) rsub(b *FP) { - F.Neg() - F.Add(b) +func (F *FP) rsub(b *FP, mem *arena.Arena) { + F.Neg(mem) + F.Add(b, mem) } /* this/=2 mod Modulus */ -func (F *FP) div2() { - p := NewBIGints(Modulus) +func (F *FP) div2(mem *arena.Arena) { + p := NewBIGints(Modulus, mem) pr := F.x.parity() - w := NewBIGcopy(F.x) + w := NewBIGcopy(F.x, mem) F.x.fshr(1) w.Add(p) w.norm() @@ -413,18 +460,22 @@ func (F *FP) div2() { /* return jacobi symbol 
(this/Modulus) */ func (F *FP) jacobi() int { - w := F.Redc() - p := NewBIGints(Modulus) + mem := arena.NewArena() + defer mem.Free() + w := F.Redc(mem) + p := NewBIGints(Modulus, mem) return w.Jacobi(p) } /* return TRUE if this==a */ func (F *FP) Equals(a *FP) bool { - f := NewFPcopy(F) - s := NewFPcopy(a) + mem := arena.NewArena() + defer mem.Free() + f := NewFPcopy(F, mem) + s := NewFPcopy(a, mem) - s.reduce() - f.reduce() + s.reduce(mem) + f.reduce(mem) if Comp(s.x, f.x) == 0 { return true } @@ -432,20 +483,22 @@ func (F *FP) Equals(a *FP) bool { } func (F *FP) Comp(a *FP) int { - f := NewFPcopy(F) - s := NewFPcopy(a) + mem := arena.NewArena() + defer mem.Free() + f := NewFPcopy(F, mem) + s := NewFPcopy(a, mem) - s.reduce() - f.reduce() + s.reduce(mem) + f.reduce(mem) return Comp(s.x, f.x) } -func (F *FP) pow(e *BIG) *FP { +func (F *FP) pow(e *BIG, mem *arena.Arena) *FP { var tb []*FP var w [1 + (NLEN*int(BASEBITS)+3)/4]int8 F.norm() - t := NewBIGcopy(e) + t := NewBIGcopy(e, mem) t.norm() nb := 1 + (t.nbits()+3)/4 @@ -456,51 +509,51 @@ func (F *FP) pow(e *BIG) *FP { w[i] = int8(lsbs) t.fshr(4) } - tb = append(tb, NewFPint(1)) - tb = append(tb, NewFPcopy(F)) + tb = append(tb, NewFPint(1, mem)) + tb = append(tb, NewFPcopy(F, mem)) for i := 2; i < 16; i++ { - tb = append(tb, NewFPcopy(tb[i-1])) - tb[i].Mul(F) + tb = append(tb, NewFPcopy(tb[i-1], mem)) + tb[i].Mul(F, mem) } - r := NewFPcopy(tb[w[nb-1]]) + r := NewFPcopy(tb[w[nb-1]], mem) for i := nb - 2; i >= 0; i-- { - r.Sqr() - r.Sqr() - r.Sqr() - r.Sqr() - r.Mul(tb[w[i]]) + r.Sqr(mem) + r.Sqr(mem) + r.Sqr(mem) + r.Sqr(mem) + r.Mul(tb[w[i]], mem) } - r.reduce() + r.reduce(mem) return r } // See https://eprint.iacr.org/2018/1038 // return this^(p-3)/4 or this^(p-5)/8 -func (F *FP) fpow() *FP { +func (F *FP) fpow(mem *arena.Arena) *FP { ac := [11]int{1, 2, 3, 6, 12, 15, 30, 60, 120, 240, 255} - var xp []*FP + xp := arena.MakeSlice[*FP](mem, 11, 11) // phase 1 - xp = append(xp, NewFPcopy(F)) - xp = append(xp, 
NewFPcopy(F)) - xp[1].Sqr() - xp = append(xp, NewFPcopy(xp[1])) - xp[2].Mul(F) - xp = append(xp, NewFPcopy(xp[2])) - xp[3].Sqr() - xp = append(xp, NewFPcopy(xp[3])) - xp[4].Sqr() - xp = append(xp, NewFPcopy(xp[4])) - xp[5].Mul(xp[2]) - xp = append(xp, NewFPcopy(xp[5])) - xp[6].Sqr() - xp = append(xp, NewFPcopy(xp[6])) - xp[7].Sqr() - xp = append(xp, NewFPcopy(xp[7])) - xp[8].Sqr() - xp = append(xp, NewFPcopy(xp[8])) - xp[9].Sqr() - xp = append(xp, NewFPcopy(xp[9])) - xp[10].Mul(xp[5]) + xp[0] = NewFPcopy(F, mem) + xp[1] = NewFPcopy(F, mem) + xp[1].Sqr(mem) + xp[2] = NewFPcopy(xp[1], mem) + xp[2].Mul(F, mem) + xp[3] = NewFPcopy(xp[2], mem) + xp[3].Sqr(mem) + xp[4] = NewFPcopy(xp[3], mem) + xp[4].Sqr(mem) + xp[5] = NewFPcopy(xp[4], mem) + xp[5].Mul(xp[2], mem) + xp[6] = NewFPcopy(xp[5], mem) + xp[6].Sqr(mem) + xp[7] = NewFPcopy(xp[6], mem) + xp[7].Sqr(mem) + xp[8] = NewFPcopy(xp[7], mem) + xp[8].Sqr(mem) + xp[9] = NewFPcopy(xp[8], mem) + xp[9].Sqr(mem) + xp[10] = NewFPcopy(xp[9], mem) + xp[10].Mul(xp[5], mem) var n, c int e := int(PM1D2) @@ -529,7 +582,7 @@ func (F *FP) fpow() *FP { k := w - c i := 10 - key := NewFP() + key := NewFP(mem) if k != 0 { for ac[i] > k { @@ -544,7 +597,7 @@ func (F *FP) fpow() *FP { if ac[i] > k { continue } - key.Mul(xp[i]) + key.Mul(xp[i], mem) k -= ac[i] } // phase 2 @@ -555,19 +608,19 @@ func (F *FP) fpow() *FP { j := 3 m := 8 nw := n - bw - t := NewFP() + t := NewFP(mem) for 2*m < nw { t.copy(xp[j]) j++ for i = 0; i < m; i++ { - t.Sqr() + t.Sqr(mem) } xp[j].copy(xp[j-1]) - xp[j].Mul(t) + xp[j].Mul(t, mem) m *= 2 } lo := nw - m - r := NewFPcopy(xp[j]) + r := NewFPcopy(xp[j], mem) for lo != 0 { m /= 2 @@ -578,84 +631,86 @@ func (F *FP) fpow() *FP { lo -= m t.copy(r) for i = 0; i < m; i++ { - t.Sqr() + t.Sqr(mem) } r.copy(t) - r.Mul(xp[j]) + r.Mul(xp[j], mem) } // phase 3 if bw != 0 { for i = 0; i < bw; i++ { - r.Sqr() + r.Sqr(mem) } - r.Mul(key) + r.Mul(key, mem) } if MODTYPE == GENERALISED_MERSENNE { // Goldilocks ONLY key.copy(r) - 
r.Sqr() - r.Mul(F) + r.Sqr(mem) + r.Mul(F, mem) for i = 0; i < n+1; i++ { - r.Sqr() + r.Sqr(mem) } - r.Mul(key) + r.Mul(key, mem) } for nd > 0 { - r.Sqr() + r.Sqr(mem) nd-- } return r } // calculates r=x^(p-1-2^e)/2^{e+1) where 2^e|p-1 -func (F *FP) progen() { +func (F *FP) progen(mem *arena.Arena) { if MODTYPE == PSEUDO_MERSENNE || MODTYPE == GENERALISED_MERSENNE { - F.copy(F.fpow()) + F.copy(F.fpow(mem)) return } e := uint(PM1D2) - m := NewBIGints(Modulus) + m := NewBIGints(Modulus, mem) m.dec(1) m.shr(e) m.dec(1) m.fshr(1) - F.copy(F.pow(m)) + F.copy(F.pow(m, mem)) } /* this=1/this mod Modulus */ -func (F *FP) Invert(h *FP) { +func (F *FP) Invert(h *FP, mem *arena.Arena) { e := int(PM1D2) F.norm() - s := NewFPcopy(F) + s := NewFPcopy(F, mem) for i := 0; i < e-1; i++ { - s.Sqr() - s.Mul(F) + s.Sqr(mem) + s.Mul(F, mem) } if h == nil { - F.progen() + F.progen(mem) } else { F.copy(h) } for i := 0; i <= e; i++ { - F.Sqr() + F.Sqr(mem) } - F.Mul(s) - F.reduce() + F.Mul(s, mem) + F.reduce(mem) } /* test for Quadratic residue */ func (F *FP) qr(h *FP) int { - r := NewFPcopy(F) + mem := arena.NewArena() + defer mem.Free() + r := NewFPcopy(F, mem) e := int(PM1D2) - r.progen() + r.progen(mem) if h != nil { h.copy(r) } - r.Sqr() - r.Mul(F) + r.Sqr(mem) + r.Mul(F, mem) for i := 0; i < e-1; i++ { - r.Sqr() + r.Sqr(mem) } if r.isunity() { @@ -666,29 +721,29 @@ func (F *FP) qr(h *FP) int { } /* return sqrt(this) mod Modulus */ -func (F *FP) Sqrt(h *FP) *FP { +func (F *FP) Sqrt(h *FP, mem *arena.Arena) *FP { e := int(PM1D2) - g := NewFPcopy(F) + g := NewFPcopy(F, mem) if h == nil { - g.progen() + g.progen(mem) } else { g.copy(h) } - m := NewBIGints(ROI) - v := NewFPbig(m) + m := NewBIGints(ROI, mem) + v := NewFPbig(m, mem) - t := NewFPcopy(g) - t.Sqr() - t.Mul(F) + t := NewFPcopy(g, mem) + t.Sqr(mem) + t.Mul(F, mem) - r := NewFPcopy(F) - r.Mul(g) - b := NewFPcopy(t) + r := NewFPcopy(F, mem) + r.Mul(g, mem) + b := NewFPcopy(t, mem) for k := e; k > 1; k-- { for j := 1; j < k-1; 
j++ { - b.Sqr() + b.Sqr(mem) } var u int if b.isunity() { @@ -697,41 +752,43 @@ func (F *FP) Sqrt(h *FP) *FP { u = 1 } g.copy(r) - g.Mul(v) + g.Mul(v, mem) r.cmove(g, u) - v.Sqr() + v.Sqr(mem) g.copy(t) - g.Mul(v) + g.Mul(v, mem) t.cmove(g, u) b.copy(t) } - sgn := r.sign() - nr := NewFPcopy(r) - nr.Neg() + sgn := r.sign(mem) + nr := NewFPcopy(r, mem) + nr.Neg(mem) nr.norm() r.cmove(nr, sgn) return r } func (F *FP) invsqrt(i *FP, s *FP) int { - h := NewFP() + mem := arena.NewArena() + defer mem.Free() + h := NewFP(mem) qr := F.qr(h) - s.copy(F.Sqrt(h)) + s.copy(F.Sqrt(h, mem)) i.copy(F) - i.Invert(h) + i.Invert(h, mem) return qr } // Two for the price of one - See Hamburg https://eprint.iacr.org/2012/309.pdf // Calculate Invert of i and square root of s, return QR func FP_tpo(i *FP, s *FP) int { - w := NewFPcopy(s) - t := NewFPcopy(i) - w.Mul(i) - t.Mul(w) + w := NewFPcopy(s, nil) + t := NewFPcopy(i, nil) + w.Mul(i, nil) + t.Mul(w, nil) qr := t.invsqrt(i, s) - i.Mul(w) - s.Mul(i) + i.Mul(w, nil) + s.Mul(i, nil) return qr } diff --git a/nekryptology/pkg/core/curves/native/bls48581/fp16.go b/nekryptology/pkg/core/curves/native/bls48581/fp16.go index 5f21e49..197ff43 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/fp16.go +++ b/nekryptology/pkg/core/curves/native/bls48581/fp16.go @@ -23,6 +23,8 @@ package bls48581 +import "arena" + //import "fmt" type FP16 struct { @@ -30,46 +32,81 @@ type FP16 struct { b *FP8 } -func NewFP16() *FP16 { - F := new(FP16) - F.a = NewFP8() - F.b = NewFP8() - return F +func NewFP16(mem *arena.Arena) *FP16 { + if mem != nil { + F := arena.New[FP16](mem) + F.a = NewFP8(mem) + F.b = NewFP8(mem) + return F + } else { + F := new(FP16) + F.a = NewFP8(nil) + F.b = NewFP8(nil) + return F + } } /* Constructors */ -func NewFP16int(a int) *FP16 { - F := new(FP16) - F.a = NewFP8int(a) - F.b = NewFP8() - return F +func NewFP16int(a int, mem *arena.Arena) *FP16 { + if mem != nil { + F := arena.New[FP16](mem) + F.a = NewFP8int(a, mem) + F.b = 
NewFP8(mem) + return F + } else { + F := new(FP16) + F.a = NewFP8int(a, nil) + F.b = NewFP8(nil) + return F + } } -func NewFP16copy(x *FP16) *FP16 { - F := new(FP16) - F.a = NewFP8copy(x.a) - F.b = NewFP8copy(x.b) - return F +func NewFP16copy(x *FP16, mem *arena.Arena) *FP16 { + if mem != nil { + F := arena.New[FP16](mem) + F.a = NewFP8copy(x.a, mem) + F.b = NewFP8copy(x.b, mem) + return F + } else { + F := new(FP16) + F.a = NewFP8copy(x.a, nil) + F.b = NewFP8copy(x.b, nil) + return F + } } -func NewFP16fp8s(c *FP8, d *FP8) *FP16 { - F := new(FP16) - F.a = NewFP8copy(c) - F.b = NewFP8copy(d) - return F +func NewFP16fp8s(c *FP8, d *FP8, mem *arena.Arena) *FP16 { + if mem != nil { + F := arena.New[FP16](mem) + F.a = c + F.b = d + return F + } else { + F := new(FP16) + F.a = c + F.b = d + return F + } } -func NewFP16fp8(c *FP8) *FP16 { - F := new(FP16) - F.a = NewFP8copy(c) - F.b = NewFP8() - return F +func NewFP16fp8(c *FP8, mem *arena.Arena) *FP16 { + if mem != nil { + F := arena.New[FP16](mem) + F.a = c + F.b = NewFP8(mem) + return F + } else { + F := new(FP16) + F.a = c + F.b = NewFP8(nil) + return F + } } /* reduce all components of this mod Modulus */ -func (F *FP16) reduce() { - F.a.reduce() - F.b.reduce() +func (F *FP16) reduce(mem *arena.Arena) { + F.a.reduce(mem) + F.b.reduce(mem) } /* normalise all components of this mod Modulus */ @@ -79,8 +116,8 @@ func (F *FP16) norm() { } /* test this==0 ? */ -func (F *FP16) IsZero() bool { - return F.a.IsZero() && F.b.IsZero() +func (F *FP16) IsZero(mem *arena.Arena) bool { + return F.a.IsZero(mem) && F.b.IsZero(mem) } func (F *FP16) ToBytes(bf []byte) { @@ -107,7 +144,7 @@ func FP16_fromBytes(bf []byte) *FP16 { t[i] = bf[i+MB] } ta := FP8_fromBytes(t[:]) - return NewFP16fp8s(ta, tb) + return NewFP16fp8s(ta, tb, nil) } /* Conditional move */ @@ -118,13 +155,15 @@ func (F *FP16) cmove(g *FP16, d int) { /* test this==1 ? 
*/ func (F *FP16) isunity() bool { - one := NewFP8int(1) - return F.a.Equals(one) && F.b.IsZero() + mem := arena.NewArena() + defer mem.Free() + one := NewFP8int(1, mem) + return F.a.Equals(one) && F.b.IsZero(mem) } /* test is w real? That is in a+ib test b is zero */ func (F *FP16) isreal() bool { - return F.b.IsZero() + return F.b.IsZero(nil) } /* extract real part a */ @@ -165,137 +204,137 @@ func (F *FP16) one() { } /* set this=-this */ -func (F *FP16) Neg() { +func (F *FP16) Neg(mem *arena.Arena) { F.norm() - m := NewFP8copy(F.a) - t := NewFP8() - m.Add(F.b) - m.Neg() + m := NewFP8copy(F.a, mem) + t := NewFP8(mem) + m.Add(F.b, mem) + m.Neg(mem) t.copy(m) - t.Add(F.b) + t.Add(F.b, mem) F.b.copy(m) - F.b.Add(F.a) + F.b.Add(F.a, mem) F.a.copy(t) F.norm() } /* this=conjugate(this) */ -func (F *FP16) conj() { - F.b.Neg() +func (F *FP16) conj(mem *arena.Arena) { + F.b.Neg(mem) F.norm() } /* this=-conjugate(this) */ -func (F *FP16) nconj() { - F.a.Neg() +func (F *FP16) nconj(mem *arena.Arena) { + F.a.Neg(mem) F.norm() } /* this+=x */ -func (F *FP16) Add(x *FP16) { - F.a.Add(x.a) - F.b.Add(x.b) +func (F *FP16) Add(x *FP16, mem *arena.Arena) { + F.a.Add(x.a, mem) + F.b.Add(x.b, mem) } /* this-=x */ -func (F *FP16) Sub(x *FP16) { - m := NewFP16copy(x) - m.Neg() - F.Add(m) +func (F *FP16) Sub(x *FP16, mem *arena.Arena) { + m := NewFP16copy(x, mem) + m.Neg(mem) + F.Add(m, mem) } /* this-=x */ -func (F *FP16) rsub(x *FP16) { - F.Neg() - F.Add(x) +func (F *FP16) rsub(x *FP16, mem *arena.Arena) { + F.Neg(mem) + F.Add(x, mem) } /* this*=s where s is FP8 */ -func (F *FP16) pmul(s *FP8) { - F.a.Mul(s) - F.b.Mul(s) +func (F *FP16) pmul(s *FP8, mem *arena.Arena) { + F.a.Mul(s, mem) + F.b.Mul(s, mem) } /* this*=s where s is FP2 */ -func (F *FP16) qmul(s *FP2) { - F.a.qmul(s) - F.b.qmul(s) +func (F *FP16) qmul(s *FP2, mem *arena.Arena) { + F.a.qmul(s, mem) + F.b.qmul(s, mem) } /* this*=s where s is FP */ -func (F *FP16) tmul(s *FP) { - F.a.tmul(s) - F.b.tmul(s) +func (F *FP16) 
tmul(s *FP, mem *arena.Arena) { + F.a.tmul(s, mem) + F.b.tmul(s, mem) } /* this*=c where c is int */ -func (F *FP16) imul(c int) { - F.a.imul(c) - F.b.imul(c) +func (F *FP16) imul(c int, mem *arena.Arena) { + F.a.imul(c, mem) + F.b.imul(c, mem) } /* this*=this */ -func (F *FP16) Sqr() { - t1 := NewFP8copy(F.a) - t2 := NewFP8copy(F.b) - t3 := NewFP8copy(F.a) +func (F *FP16) Sqr(mem *arena.Arena) { + t1 := NewFP8copy(F.a, mem) + t2 := NewFP8copy(F.b, mem) + t3 := NewFP8copy(F.a, mem) - t3.Mul(F.b) - t1.Add(F.b) - t2.times_i() + t3.Mul(F.b, mem) + t1.Add(F.b, mem) + t2.times_i(mem) - t2.Add(F.a) + t2.Add(F.a, mem) t1.norm() t2.norm() F.a.copy(t1) - F.a.Mul(t2) + F.a.Mul(t2, mem) t2.copy(t3) - t2.times_i() - t2.Add(t3) + t2.times_i(mem) + t2.Add(t3, mem) t2.norm() - t2.Neg() - F.a.Add(t2) + t2.Neg(mem) + F.a.Add(t2, mem) F.b.copy(t3) - F.b.Add(t3) + F.b.Add(t3, mem) F.norm() } /* this*=y */ -func (F *FP16) Mul(y *FP16) { - t1 := NewFP8copy(F.a) - t2 := NewFP8copy(F.b) - t3 := NewFP8() - t4 := NewFP8copy(F.b) +func (F *FP16) Mul(y *FP16, mem *arena.Arena) { + t1 := NewFP8copy(F.a, mem) + t2 := NewFP8copy(F.b, mem) + t3 := NewFP8(mem) + t4 := NewFP8copy(F.b, mem) - t1.Mul(y.a) - t2.Mul(y.b) + t1.Mul(y.a, mem) + t2.Mul(y.b, mem) t3.copy(y.b) - t3.Add(y.a) - t4.Add(F.a) + t3.Add(y.a, mem) + t4.Add(F.a, mem) t3.norm() t4.norm() - t4.Mul(t3) + t4.Mul(t3, mem) t3.copy(t1) - t3.Neg() - t4.Add(t3) + t3.Neg(mem) + t4.Add(t3, mem) t4.norm() t3.copy(t2) - t3.Neg() + t3.Neg(mem) F.b.copy(t4) - F.b.Add(t3) + F.b.Add(t3, mem) - t2.times_i() + t2.times_i(mem) F.a.copy(t2) - F.a.Add(t1) + F.a.Add(t1, mem) F.norm() } @@ -306,77 +345,77 @@ func (F *FP16) toString() string { } /* this=1/this */ -func (F *FP16) Invert() { - t1 := NewFP8copy(F.a) - t2 := NewFP8copy(F.b) +func (F *FP16) Invert(mem *arena.Arena) { + t1 := NewFP8copy(F.a, mem) + t2 := NewFP8copy(F.b, mem) - t1.Sqr() - t2.Sqr() - t2.times_i() + t1.Sqr(mem) + t2.Sqr(mem) + t2.times_i(mem) t2.norm() - t1.Sub(t2) + t1.Sub(t2, mem) 
t1.norm() - t1.Invert(nil) + t1.Invert(nil, mem) - F.a.Mul(t1) - t1.Neg() + F.a.Mul(t1, mem) + t1.Neg(mem) t1.norm() - F.b.Mul(t1) + F.b.Mul(t1, mem) } /* this*=i where i = sqrt(sqrt(-1+sqrt(-1))) */ -func (F *FP16) times_i() { - s := NewFP8copy(F.b) - t := NewFP8copy(F.a) - s.times_i() +func (F *FP16) times_i(mem *arena.Arena) { + s := NewFP8copy(F.b, mem) + t := NewFP8copy(F.a, mem) + s.times_i(mem) F.a.copy(s) F.b.copy(t) F.norm() } -func (F *FP16) times_i2() { - F.a.times_i() - F.b.times_i() +func (F *FP16) times_i2(mem *arena.Arena) { + F.a.times_i(mem) + F.b.times_i(mem) } -func (F *FP16) times_i4() { - F.a.times_i2() - F.b.times_i2() +func (F *FP16) times_i4(mem *arena.Arena) { + F.a.times_i2(mem) + F.b.times_i2(mem) } /* this=this^p using Frobenius */ -func (F *FP16) frob(f *FP2) { - ff := NewFP2copy(f) - ff.Sqr() +func (F *FP16) frob(f *FP2, mem *arena.Arena) { + ff := NewFP2copy(f, mem) + ff.Sqr(mem) ff.norm() - F.a.frob(ff) - F.b.frob(ff) - F.b.qmul(f) - F.b.times_i() + F.a.frob(ff, mem) + F.b.frob(ff, mem) + F.b.qmul(f, mem) + F.b.times_i(mem) } /* this=this^e */ -func (F *FP16) pow(e *BIG) *FP16 { - w := NewFP16copy(F) +func (F *FP16) pow(e *BIG, mem *arena.Arena) *FP16 { + w := NewFP16copy(F, mem) w.norm() - z := NewBIGcopy(e) - r := NewFP16int(1) + z := NewBIGcopy(e, mem) + r := NewFP16int(1, mem) z.norm() for true { bt := z.parity() z.fshr(1) if bt == 1 { - r.Mul(w) + r.Mul(w, mem) } if z.IsZero() { break } - w.Sqr() + w.Sqr(mem) } - r.reduce() + r.reduce(mem) return r } diff --git a/nekryptology/pkg/core/curves/native/bls48581/fp2.go b/nekryptology/pkg/core/curves/native/bls48581/fp2.go index 861445d..1824f4b 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/fp2.go +++ b/nekryptology/pkg/core/curves/native/bls48581/fp2.go @@ -23,7 +23,11 @@ package bls48581 -import "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" +import ( + "arena" + + 
"source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" +) //import "fmt" @@ -32,72 +36,128 @@ type FP2 struct { b *FP } -func NewFP2() *FP2 { - F := new(FP2) - F.a = NewFP() - F.b = NewFP() - return F +func NewFP2(mem *arena.Arena) *FP2 { + if mem != nil { + F := arena.New[FP2](mem) + F.a = NewFP(mem) + F.b = NewFP(mem) + return F + } else { + F := new(FP2) + F.a = NewFP(nil) + F.b = NewFP(nil) + return F + } } /* Constructors */ -func NewFP2int(a int) *FP2 { - F := new(FP2) - F.a = NewFPint(a) - F.b = NewFP() - return F +func NewFP2int(a int, mem *arena.Arena) *FP2 { + if mem != nil { + F := arena.New[FP2](mem) + F.a = NewFPint(a, mem) + F.b = NewFP(mem) + return F + } else { + F := new(FP2) + F.a = NewFPint(a, nil) + F.b = NewFP(nil) + return F + } } -func NewFP2ints(a int, b int) *FP2 { - F := new(FP2) - F.a = NewFPint(a) - F.b = NewFPint(b) - return F +func NewFP2ints(a int, b int, mem *arena.Arena) *FP2 { + if mem != nil { + F := arena.New[FP2](mem) + F.a = NewFPint(a, mem) + F.b = NewFPint(b, mem) + return F + } else { + F := new(FP2) + F.a = NewFPint(a, nil) + F.b = NewFPint(b, nil) + return F + } } -func NewFP2copy(x *FP2) *FP2 { - F := new(FP2) - F.a = NewFPcopy(x.a) - F.b = NewFPcopy(x.b) - return F +func NewFP2copy(x *FP2, mem *arena.Arena) *FP2 { + if mem != nil { + F := arena.New[FP2](mem) + F.a = NewFPcopy(x.a, mem) + F.b = NewFPcopy(x.b, mem) + return F + } else { + F := new(FP2) + F.a = NewFPcopy(x.a, nil) + F.b = NewFPcopy(x.b, nil) + return F + } } -func NewFP2fps(c *FP, d *FP) *FP2 { - F := new(FP2) - F.a = NewFPcopy(c) - F.b = NewFPcopy(d) - return F +func NewFP2fps(c *FP, d *FP, mem *arena.Arena) *FP2 { + if mem != nil { + F := arena.New[FP2](mem) + F.a = NewFPcopy(c, mem) + F.b = NewFPcopy(d, mem) + return F + } else { + F := new(FP2) + F.a = NewFPcopy(c, nil) + F.b = NewFPcopy(d, nil) + return F + } } -func NewFP2bigs(c *BIG, d *BIG) *FP2 { - F := new(FP2) - F.a = NewFPbig(c) - F.b = NewFPbig(d) - return 
F +func NewFP2bigs(c *BIG, d *BIG, mem *arena.Arena) *FP2 { + if mem != nil { + F := arena.New[FP2](mem) + F.a = NewFPbig(c, mem) + F.b = NewFPbig(d, mem) + return F + } else { + F := new(FP2) + F.a = NewFPbig(c, nil) + F.b = NewFPbig(d, nil) + return F + } } -func NewFP2fp(c *FP) *FP2 { - F := new(FP2) - F.a = NewFPcopy(c) - F.b = NewFP() - return F +func NewFP2fp(c *FP, mem *arena.Arena) *FP2 { + if mem != nil { + F := arena.New[FP2](mem) + F.a = NewFPcopy(c, mem) + F.b = NewFP(mem) + return F + } else { + F := new(FP2) + F.a = NewFPcopy(c, nil) + F.b = NewFP(nil) + return F + } } -func NewFP2big(c *BIG) *FP2 { - F := new(FP2) - F.a = NewFPbig(c) - F.b = NewFP() - return F +func NewFP2big(c *BIG, mem *arena.Arena) *FP2 { + if mem != nil { + F := arena.New[FP2](mem) + F.a = NewFPbig(c, mem) + F.b = NewFP(mem) + return F + } else { + F := new(FP2) + F.a = NewFPbig(c, nil) + F.b = NewFP(nil) + return F + } } func NewFP2rand(rng *ext.RAND) *FP2 { - F := NewFP2fps(NewFPrand(rng), NewFPrand(rng)) + F := NewFP2fps(NewFPrand(rng), NewFPrand(rng), nil) return F } /* reduce components mod Modulus */ -func (F *FP2) reduce() { - F.a.reduce() - F.b.reduce() +func (F *FP2) reduce(mem *arena.Arena) { + F.a.reduce(mem) + F.b.reduce(mem) } /* normalise components of w */ @@ -107,12 +167,12 @@ func (F *FP2) norm() { } /* test this=0 ? */ -func (F *FP2) IsZero() bool { - return (F.a.IsZero() && F.b.IsZero()) +func (F *FP2) IsZero(mem *arena.Arena) bool { + return (F.a.IsZero(mem) && F.b.IsZero(mem)) } func (F *FP2) islarger() int { - if F.IsZero() { + if F.IsZero(nil) { return 0 } cmp := F.b.islarger() @@ -146,7 +206,7 @@ func FP2_fromBytes(bf []byte) *FP2 { t[i] = bf[i+MB] } ta := FP_fromBytes(t[:]) - return NewFP2fps(ta, tb) + return NewFP2fps(ta, tb, nil) } func (F *FP2) cmove(g *FP2, d int) { @@ -156,8 +216,10 @@ func (F *FP2) cmove(g *FP2, d int) { /* test this=1 ? 
*/ func (F *FP2) isunity() bool { - one := NewFPint(1) - return (F.a.Equals(one) && F.b.IsZero()) + mem := arena.NewArena() + defer mem.Free() + one := NewFPint(1, mem) + return (F.a.Equals(one) && F.b.IsZero(mem)) } /* test this=x */ @@ -166,13 +228,13 @@ func (F *FP2) Equals(x *FP2) bool { } /* extract a */ -func (F *FP2) GetA() *BIG { - return F.a.Redc() +func (F *FP2) GetA(mem *arena.Arena) *BIG { + return F.a.Redc(mem) } /* extract b */ -func (F *FP2) GetB() *BIG { - return F.b.Redc() +func (F *FP2) GetB(mem *arena.Arena) *BIG { + return F.b.Redc(mem) } /* copy this=x */ @@ -194,12 +256,12 @@ func (F *FP2) one() { } /* Return sign */ -func (F *FP2) sign() int { - p1 := F.a.sign() - p2 := F.b.sign() +func (F *FP2) sign(mem *arena.Arena) int { + p1 := F.a.sign(mem) + p2 := F.b.sign(mem) var u int if BIG_ENDIAN_SIGN { - if F.b.IsZero() { + if F.b.IsZero(mem) { u = 1 } else { u = 0 @@ -207,7 +269,7 @@ func (F *FP2) sign() int { p2 ^= (p1 ^ p2) & u return p2 } else { - if F.a.IsZero() { + if F.a.IsZero(mem) { u = 1 } else { u = 0 @@ -218,106 +280,106 @@ func (F *FP2) sign() int { } /* negate this mod Modulus */ -func (F *FP2) Neg() { - m := NewFPcopy(F.a) - t := NewFP() +func (F *FP2) Neg(mem *arena.Arena) { + m := NewFPcopy(F.a, mem) + t := NewFP(mem) - m.Add(F.b) - m.Neg() + m.Add(F.b, mem) + m.Neg(mem) t.copy(m) - t.Add(F.b) + t.Add(F.b, mem) F.b.copy(m) - F.b.Add(F.a) + F.b.Add(F.a, mem) F.a.copy(t) } /* set to a-ib */ -func (F *FP2) conj() { - F.b.Neg() +func (F *FP2) conj(mem *arena.Arena) { + F.b.Neg(mem) F.b.norm() } /* this+=a */ -func (F *FP2) Add(x *FP2) { - F.a.Add(x.a) - F.b.Add(x.b) +func (F *FP2) Add(x *FP2, mem *arena.Arena) { + F.a.Add(x.a, mem) + F.b.Add(x.b, mem) } /* this-=a */ -func (F *FP2) Sub(x *FP2) { - m := NewFP2copy(x) - m.Neg() - F.Add(m) +func (F *FP2) Sub(x *FP2, mem *arena.Arena) { + m := NewFP2copy(x, mem) + m.Neg(mem) + F.Add(m, mem) } /* this-=a */ -func (F *FP2) rsub(x *FP2) { - F.Neg() - F.Add(x) +func (F *FP2) rsub(x *FP2, mem 
*arena.Arena) { + F.Neg(mem) + F.Add(x, mem) } /* this*=s, where s is an FP */ -func (F *FP2) pmul(s *FP) { - F.a.Mul(s) - F.b.Mul(s) +func (F *FP2) pmul(s *FP, mem *arena.Arena) { + F.a.Mul(s, mem) + F.b.Mul(s, mem) } /* this*=i, where i is an int */ -func (F *FP2) imul(c int) { - F.a.imul(c) - F.b.imul(c) +func (F *FP2) imul(c int, mem *arena.Arena) { + F.a.imul(c, mem) + F.b.imul(c, mem) } /* this*=this */ -func (F *FP2) Sqr() { - w1 := NewFPcopy(F.a) - w3 := NewFPcopy(F.a) - mb := NewFPcopy(F.b) - w1.Add(F.b) +func (F *FP2) Sqr(mem *arena.Arena) { + w1 := NewFPcopy(F.a, mem) + w3 := NewFPcopy(F.a, mem) + mb := NewFPcopy(F.b, mem) + w1.Add(F.b, mem) - w3.Add(F.a) + w3.Add(F.a, mem) w3.norm() - F.b.Mul(w3) + F.b.Mul(w3, mem) - mb.Neg() - F.a.Add(mb) + mb.Neg(mem) + F.a.Add(mb, mem) w1.norm() F.a.norm() - F.a.Mul(w1) + F.a.Mul(w1, mem) } /* this*=y */ /* Now using Lazy reduction */ -func (F *FP2) Mul(y *FP2) { +func (F *FP2) Mul(y *FP2, mem *arena.Arena) { if int64(F.a.XES+F.b.XES)*int64(y.a.XES+y.b.XES) > int64(FEXCESS) { if F.a.XES > 1 { - F.a.reduce() + F.a.reduce(mem) } if F.b.XES > 1 { - F.b.reduce() + F.b.reduce(mem) } } - pR := NewDBIG() - C := NewBIGcopy(F.a.x) - D := NewBIGcopy(y.a.x) - p := NewBIGints(Modulus) + pR := NewDBIG(mem) + C := NewBIGcopy(F.a.x, mem) + D := NewBIGcopy(y.a.x, mem) + p := NewBIGints(Modulus, mem) pR.ucopy(p) - A := mul(F.a.x, y.a.x) - B := mul(F.b.x, y.b.x) + A := mul(F.a.x, y.a.x, mem) + B := mul(F.b.x, y.b.x, mem) C.Add(F.b.x) C.norm() D.Add(y.b.x) D.norm() - E := mul(C, D) - FF := NewDBIGcopy(A) + E := mul(C, D, mem) + FF := NewDBIGcopy(A, mem) FF.Add(B) B.rsub(pR) @@ -326,82 +388,84 @@ func (F *FP2) Mul(y *FP2) { E.Sub(FF) E.norm() - F.a.x.copy(mod(A)) + F.a.x.copy(mod(A, mem)) F.a.XES = 3 - F.b.x.copy(mod(E)) + F.b.x.copy(mod(E, mem)) F.b.XES = 2 } /* -func (F *FP2) pow(b *BIG) { - w := NewFP2copy(F); - r := NewFP2int(1) - z := NewBIGcopy(b) - for true { - bt := z.parity() - z.shr(1) - if bt==1 { - r.Mul(w) + func (F *FP2) 
pow(b *BIG) { + w := NewFP2copy(F); + r := NewFP2int(1) + z := NewBIGcopy(b) + for true { + bt := z.parity() + z.shr(1) + if bt==1 { + r.Mul(w) + } + if z.IsZero() {break} + w.Sqr() } - if z.IsZero() {break} - w.Sqr() + r.reduce() + F.copy(r) } - r.reduce() - F.copy(r) -} */ func (F *FP2) qr(h *FP) int { - c := NewFP2copy(F) - c.conj() - c.Mul(F) + mem := arena.NewArena() + defer mem.Free() + c := NewFP2copy(F, mem) + c.conj(mem) + c.Mul(F, mem) return c.a.qr(h) } /* sqrt(a+ib) = sqrt(a+sqrt(a*a-n*b*b)/2)+ib/(2*sqrt(a+sqrt(a*a-n*b*b)/2)) */ -func (F *FP2) Sqrt(h *FP) { - if F.IsZero() { +func (F *FP2) Sqrt(h *FP, mem *arena.Arena) { + if F.IsZero(mem) { return } - w1 := NewFPcopy(F.b) - w2 := NewFPcopy(F.a) - w3 := NewFP() - w4 := NewFP() - hint := NewFP() - w1.Sqr() - w2.Sqr() - w1.Add(w2) + w1 := NewFPcopy(F.b, mem) + w2 := NewFPcopy(F.a, mem) + w3 := NewFP(mem) + w4 := NewFP(mem) + hint := NewFP(mem) + w1.Sqr(mem) + w2.Sqr(mem) + w1.Add(w2, mem) w1.norm() - w1 = w1.Sqrt(h) + w1 = w1.Sqrt(h, mem) w2.copy(F.a) w3.copy(F.a) - w2.Add(w1) + w2.Add(w1, mem) w2.norm() - w2.div2() + w2.div2(mem) w1.copy(F.b) - w1.div2() + w1.div2(mem) qr := w2.qr(hint) // tweak hint w3.copy(hint) - w3.Neg() + w3.Neg(mem) w3.norm() w4.copy(w2) - w4.Neg() + w4.Neg(mem) w4.norm() w2.cmove(w4, 1-qr) hint.cmove(w3, 1-qr) - F.a.copy(w2.Sqrt(hint)) + F.a.copy(w2.Sqrt(hint, mem)) w3.copy(w2) - w3.Invert(hint) - w3.Mul(F.a) + w3.Invert(hint, mem) + w3.Mul(F.a, mem) F.b.copy(w3) - F.b.Mul(w1) + F.b.Mul(w1, mem) w4.copy(F.a) F.a.cmove(F.b, 1-qr) @@ -425,9 +489,9 @@ func (F *FP2) Sqrt(h *FP) { F.b.cmove(w4,1-qr) */ - sgn := F.sign() - nr := NewFP2copy(F) - nr.Neg() + sgn := F.sign(mem) + nr := NewFP2copy(F, mem) + nr.Neg(mem) nr.norm() F.cmove(nr, sgn) } @@ -443,63 +507,63 @@ func (F *FP2) toString() string { } /* this=1/this */ -func (F *FP2) Invert(h *FP) { +func (F *FP2) Invert(h *FP, mem *arena.Arena) { F.norm() - w1 := NewFPcopy(F.a) - w2 := NewFPcopy(F.b) + w1 := NewFPcopy(F.a, mem) + w2 := 
NewFPcopy(F.b, mem) - w1.Sqr() - w2.Sqr() - w1.Add(w2) - w1.Invert(h) - F.a.Mul(w1) - w1.Neg() + w1.Sqr(mem) + w2.Sqr(mem) + w1.Add(w2, mem) + w1.Invert(h, mem) + F.a.Mul(w1, mem) + w1.Neg(mem) w1.norm() - F.b.Mul(w1) + F.b.Mul(w1, mem) } /* this/=2 */ -func (F *FP2) div2() { - F.a.div2() - F.b.div2() +func (F *FP2) div2(mem *arena.Arena) { + F.a.div2(mem) + F.b.div2(mem) } /* this*=sqrt(-1) */ -func (F *FP2) times_i() { - z := NewFPcopy(F.a) +func (F *FP2) times_i(mem *arena.Arena) { + z := NewFPcopy(F.a, mem) F.a.copy(F.b) - F.a.Neg() + F.a.Neg(mem) F.b.copy(z) } /* w*=(1+sqrt(-1)) */ /* where X*2-(2^i+sqrt(-1)) is irreducible for FP4 */ -func (F *FP2) Mul_ip() { - t := NewFP2copy(F) +func (F *FP2) Mul_ip(mem *arena.Arena) { + t := NewFP2copy(F, mem) i := QNRI - F.times_i() + F.times_i(mem) for i > 0 { - t.Add(t) + t.Add(t, mem) t.norm() i-- } - F.Add(t) + F.Add(t, mem) if TOWER == POSITOWER { F.norm() - F.Neg() + F.Neg(mem) } } /* w/=(2^i+sqrt(-1)) */ -func (F *FP2) div_ip() { - z := NewFP2ints(1<= 0; i-- { if v.bit(i) != 1 { t.copy(b) - sf.conj() - c.conj() - b.xtr_A(a, sf, c) - sf.conj() + sf.conj(mem) + c.conj(mem) + b.xtr_A(a, sf, c, mem) + sf.conj(mem) c.copy(t) - c.xtr_D() - a.xtr_D() + c.xtr_D(mem) + a.xtr_D(mem) } else { t.copy(a) - t.conj() + t.conj(mem) a.copy(b) - a.xtr_D() - b.xtr_A(c, sf, t) - c.xtr_D() + a.xtr_D(mem) + b.xtr_A(c, sf, t, mem) + c.xtr_D(mem) } } if par == 0 { @@ -492,25 +549,25 @@ func (F *FP4) xtr_pow(n *BIG) *FP4 { } else { r.copy(b) } - r.reduce() + r.reduce(mem) return r } /* r=ck^a.cl^n using XTR double exponentiation method on traces of FP12s. See Stam thesis. 
*/ -func (F *FP4) xtr_pow2(ck *FP4, ckml *FP4, ckm2l *FP4, a *BIG, b *BIG) *FP4 { +func (F *FP4) xtr_pow2(ck *FP4, ckml *FP4, ckm2l *FP4, a *BIG, b *BIG, mem *arena.Arena) *FP4 { - e := NewBIGcopy(a) - d := NewBIGcopy(b) - w := NewBIGint(0) + e := NewBIGcopy(a, mem) + d := NewBIGcopy(b, mem) + w := NewBIGint(0, mem) e.norm() d.norm() - cu := NewFP4copy(ck) // can probably be passed in w/o copying - cv := NewFP4copy(F) - cumv := NewFP4copy(ckml) - cum2v := NewFP4copy(ckm2l) - r := NewFP4() - t := NewFP4() + cu := NewFP4copy(ck, mem) // can probably be passed in w/o copying + cv := NewFP4copy(F, mem) + cumv := NewFP4copy(ckml, mem) + cum2v := NewFP4copy(ckm2l, mem) + r := NewFP4(mem) + t := NewFP4(mem) f2 := 0 for d.parity() == 0 && e.parity() == 0 { @@ -531,9 +588,9 @@ func (F *FP4) xtr_pow2(ck *FP4, ckml *FP4, ckm2l *FP4, a *BIG, b *BIG) *FP4 { e.norm() t.copy(cv) - t.xtr_A(cu, cumv, cum2v) + t.xtr_A(cu, cumv, cum2v, mem) cum2v.copy(cumv) - cum2v.conj() + cum2v.conj(mem) cumv.copy(cv) cv.copy(cu) cu.copy(t) @@ -541,24 +598,24 @@ func (F *FP4) xtr_pow2(ck *FP4, ckml *FP4, ckm2l *FP4, a *BIG, b *BIG) *FP4 { if d.parity() == 0 { d.fshr(1) r.copy(cum2v) - r.conj() + r.conj(mem) t.copy(cumv) - t.xtr_A(cu, cv, r) + t.xtr_A(cu, cv, r, mem) cum2v.copy(cumv) - cum2v.xtr_D() + cum2v.xtr_D(mem) cumv.copy(t) - cu.xtr_D() + cu.xtr_D(mem) } else { if e.parity() == 1 { d.Sub(e) d.norm() d.fshr(1) t.copy(cv) - t.xtr_A(cu, cumv, cum2v) - cu.xtr_D() + t.xtr_A(cu, cumv, cum2v, mem) + cu.xtr_D(mem) cum2v.copy(cv) - cum2v.xtr_D() - cum2v.conj() + cum2v.xtr_D(mem) + cum2v.conj(mem) cv.copy(t) } else { w.copy(d) @@ -566,13 +623,13 @@ func (F *FP4) xtr_pow2(ck *FP4, ckml *FP4, ckm2l *FP4, a *BIG, b *BIG) *FP4 { d.fshr(1) e.copy(w) t.copy(cumv) - t.xtr_D() + t.xtr_D(mem) cumv.copy(cum2v) - cumv.conj() + cumv.conj(mem) cum2v.copy(t) - cum2v.conj() + cum2v.conj(mem) t.copy(cv) - t.xtr_D() + t.xtr_D(mem) cv.copy(cu) cu.copy(t) } @@ -587,7 +644,7 @@ func (F *FP4) xtr_pow2(ck *FP4, ckml *FP4, 
ckm2l *FP4, a *BIG, b *BIG) *FP4 { e.Sub(d) e.norm() t.copy(cv) - t.xtr_A(cu, cumv, cum2v) + t.xtr_A(cu, cumv, cum2v, mem) cum2v.copy(cumv) cumv.copy(cu) cu.copy(t) @@ -598,13 +655,13 @@ func (F *FP4) xtr_pow2(ck *FP4, ckml *FP4, ckm2l *FP4, a *BIG, b *BIG) *FP4 { d.fshr(1) e.copy(w) t.copy(cumv) - t.xtr_D() + t.xtr_D(mem) cumv.copy(cum2v) - cumv.conj() + cumv.conj(mem) cum2v.copy(t) - cum2v.conj() + cum2v.conj(mem) t.copy(cv) - t.xtr_D() + t.xtr_D(mem) cv.copy(cu) cu.copy(t) } else { @@ -616,52 +673,52 @@ func (F *FP4) xtr_pow2(ck *FP4, ckml *FP4, ckm2l *FP4, a *BIG, b *BIG) *FP4 { d.copy(w) d.fshr(1) t.copy(cv) - t.xtr_A(cu, cumv, cum2v) - cumv.conj() + t.xtr_A(cu, cumv, cum2v, mem) + cumv.conj(mem) cum2v.copy(cu) - cum2v.xtr_D() - cum2v.conj() + cum2v.xtr_D(mem) + cum2v.conj(mem) cu.copy(cv) - cu.xtr_D() + cu.xtr_D(mem) cv.copy(t) } else { d.fshr(1) r.copy(cum2v) - r.conj() + r.conj(mem) t.copy(cumv) - t.xtr_A(cu, cv, r) + t.xtr_A(cu, cv, r, mem) cum2v.copy(cumv) - cum2v.xtr_D() + cum2v.xtr_D(mem) cumv.copy(t) - cu.xtr_D() + cu.xtr_D(mem) } } } } } r.copy(cv) - r.xtr_A(cu, cumv, cum2v) + r.xtr_A(cu, cumv, cum2v, mem) for i := 0; i < f2; i++ { - r.xtr_D() + r.xtr_D(mem) } - r = r.xtr_pow(d) + r = r.xtr_pow(d, mem) return r } /* this/=2 */ -func (F *FP4) div2() { - F.a.div2() - F.b.div2() +func (F *FP4) div2(mem *arena.Arena) { + F.a.div2(mem) + F.b.div2(mem) } -func (F *FP4) div_i() { - u := NewFP2copy(F.a) - v := NewFP2copy(F.b) - u.div_ip() +func (F *FP4) div_i(mem *arena.Arena) { + u := NewFP2copy(F.a, mem) + v := NewFP2copy(F.b, mem) + u.div_ip(mem) F.a.copy(v) F.b.copy(u) if TOWER == POSITOWER { - F.Neg() + F.Neg(mem) F.norm() } } @@ -688,70 +745,72 @@ func (F *FP4) pow(b *BIG) { /* */ // Test for Quadratic Residue func (F *FP4) qr(h *FP) int { - c := NewFP4copy(F) - c.conj() - c.Mul(F) + mem := arena.NewArena() + defer mem.Free() + c := NewFP4copy(F, mem) + c.conj(mem) + c.Mul(F, mem) return c.a.qr(h) } // sqrt(a+ib) = 
sqrt(a+sqrt(a*a-n*b*b)/2)+ib/(2*sqrt(a+sqrt(a*a-n*b*b)/2)) -func (F *FP4) Sqrt(h *FP) { - if F.IsZero() { +func (F *FP4) Sqrt(h *FP, mem *arena.Arena) { + if F.IsZero(mem) { return } - a := NewFP2copy(F.a) - b := NewFP2() - s := NewFP2copy(F.b) - t := NewFP2copy(F.a) - hint := NewFP() + a := NewFP2copy(F.a, mem) + b := NewFP2(mem) + s := NewFP2copy(F.b, mem) + t := NewFP2copy(F.a, mem) + hint := NewFP(mem) - s.Sqr() - a.Sqr() - s.Mul_ip() + s.Sqr(mem) + a.Sqr(mem) + s.Mul_ip(mem) s.norm() - a.Sub(s) + a.Sub(s, mem) s.copy(a) s.norm() - s.Sqrt(h) + s.Sqrt(h, mem) a.copy(t) b.copy(t) - a.Add(s) + a.Add(s, mem) a.norm() - a.div2() + a.div2(mem) b.copy(F.b) - b.div2() + b.div2(mem) qr := a.qr(hint) // tweak hint - multiply old hint by Norm(1/Beta)^e where Beta is irreducible polynomial s.copy(a) - twk := NewFPbig(NewBIGints(TWK)) - twk.Mul(hint) - s.div_ip() + twk := NewFPbig(NewBIGints(TWK, mem), mem) + twk.Mul(hint, mem) + s.div_ip(mem) s.norm() a.cmove(s, 1-qr) hint.cmove(twk, 1-qr) F.a.copy(a) - F.a.Sqrt(hint) + F.a.Sqrt(hint, mem) s.copy(a) - s.Invert(hint) - s.Mul(F.a) + s.Invert(hint, mem) + s.Mul(F.a, mem) F.b.copy(s) - F.b.Mul(b) + F.b.Mul(b, mem) t.copy(F.a) F.a.cmove(F.b, 1-qr) F.b.cmove(t, 1-qr) - sgn := F.sign() - nr := NewFP4copy(F) - nr.Neg() + sgn := F.sign(mem) + nr := NewFP4copy(F, mem) + nr.Neg(mem) nr.norm() F.cmove(nr, sgn) } diff --git a/nekryptology/pkg/core/curves/native/bls48581/fp48.go b/nekryptology/pkg/core/curves/native/bls48581/fp48.go index fd8df7f..50e9d85 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/fp48.go +++ b/nekryptology/pkg/core/curves/native/bls48581/fp48.go @@ -22,6 +22,8 @@ package bls48581 +import "arena" + //import "fmt" type FP48 struct { @@ -32,29 +34,52 @@ type FP48 struct { } /* Constructors */ -func NewFP48fp16(d *FP16) *FP48 { - F := new(FP48) - F.a = NewFP16copy(d) - F.b = NewFP16() - F.c = NewFP16() - F.stype = FP_SPARSEST - return F +func NewFP48fp16(d *FP16, mem *arena.Arena) *FP48 { + if mem != nil { + 
F := arena.New[FP48](mem) + F.a = NewFP16copy(d, mem) + F.b = NewFP16(mem) + F.c = NewFP16(mem) + F.stype = FP_SPARSEST + return F + } else { + F := new(FP48) + F.a = NewFP16copy(d, nil) + F.b = NewFP16(nil) + F.c = NewFP16(nil) + F.stype = FP_SPARSEST + return F + } } -func NewFP48() *FP48 { - F := new(FP48) - F.a = NewFP16() - F.b = NewFP16() - F.c = NewFP16() - F.stype = FP_ZERO - return F +func NewFP48(mem *arena.Arena) *FP48 { + if mem != nil { + F := arena.New[FP48](mem) + F.a = NewFP16(mem) + F.b = NewFP16(mem) + F.c = NewFP16(mem) + F.stype = FP_ZERO + return F + } else { + F := new(FP48) + F.a = NewFP16(nil) + F.b = NewFP16(nil) + F.c = NewFP16(nil) + F.stype = FP_ZERO + return F + } } -func NewFP48int(d int) *FP48 { - F := new(FP48) - F.a = NewFP16int(d) - F.b = NewFP16() - F.c = NewFP16() +func NewFP48int(d int, mem *arena.Arena) *FP48 { + var F *FP48 + if mem != nil { + F = arena.New[FP48](mem) + } else { + F = new(FP48) + } + F.a = NewFP16int(d, mem) + F.b = NewFP16(mem) + F.c = NewFP16(mem) if d == 1 { F.stype = FP_ONE } else { @@ -63,29 +88,39 @@ func NewFP48int(d int) *FP48 { return F } -func NewFP48fp16s(d *FP16, e *FP16, f *FP16) *FP48 { - F := new(FP48) - F.a = NewFP16copy(d) - F.b = NewFP16copy(e) - F.c = NewFP16copy(f) +func NewFP48fp16s(d *FP16, e *FP16, f *FP16, mem *arena.Arena) *FP48 { + var F *FP48 + if mem != nil { + F = arena.New[FP48](mem) + } else { + F = new(FP48) + } + F.a = d + F.b = e + F.c = f F.stype = FP_DENSE return F } -func NewFP48copy(x *FP48) *FP48 { - F := new(FP48) - F.a = NewFP16copy(x.a) - F.b = NewFP16copy(x.b) - F.c = NewFP16copy(x.c) +func NewFP48copy(x *FP48, mem *arena.Arena) *FP48 { + var F *FP48 + if mem != nil { + F = arena.New[FP48](mem) + } else { + F = new(FP48) + } + F.a = NewFP16copy(x.a, mem) + F.b = NewFP16copy(x.b, mem) + F.c = NewFP16copy(x.c, mem) F.stype = x.stype return F } /* reduce all components of this mod Modulus */ -func (F *FP48) reduce() { - F.a.reduce() - F.b.reduce() - F.c.reduce() +func (F 
*FP48) reduce(mem *arena.Arena) { + F.a.reduce(mem) + F.b.reduce(mem) + F.c.reduce(mem) } /* normalise all components of this */ @@ -96,8 +131,8 @@ func (F *FP48) norm() { } /* test x==0 ? */ -func (F *FP48) IsZero() bool { - return (F.a.IsZero() && F.b.IsZero() && F.c.IsZero()) +func (F *FP48) IsZero(mem *arena.Arena) bool { + return (F.a.IsZero(mem) && F.b.IsZero(mem) && F.c.IsZero(mem)) } /* Conditional move */ @@ -126,15 +161,17 @@ func (F *FP48) selector(g []*FP48, b int32) { F.cmove(g[6], teq(babs, 6)) F.cmove(g[7], teq(babs, 7)) - invF := NewFP48copy(F) - invF.conj() + invF := NewFP48copy(F, nil) + invF.conj(nil) F.cmove(invF, int(m&1)) } /* test x==1 ? */ func (F *FP48) Isunity() bool { - one := NewFP16int(1) - return (F.a.Equals(one) && F.b.IsZero() && F.c.IsZero()) + mem := arena.NewArena() + defer mem.Free() + one := NewFP16int(1, mem) + return (F.a.Equals(one) && F.b.IsZero(mem) && F.c.IsZero(mem)) } /* return 1 if x==y, else 0 */ @@ -182,94 +219,94 @@ func (F *FP48) zero() { } /* this=conj(this) */ -func (F *FP48) conj() { - F.a.conj() - F.b.nconj() - F.c.conj() +func (F *FP48) conj(mem *arena.Arena) { + F.a.conj(mem) + F.b.nconj(mem) + F.c.conj(mem) } /* Granger-Scott Unitary Squaring */ -func (F *FP48) uSqr() { - A := NewFP16copy(F.a) - B := NewFP16copy(F.c) - C := NewFP16copy(F.b) - D := NewFP16() +func (F *FP48) uSqr(mem *arena.Arena) { + A := NewFP16copy(F.a, mem) + B := NewFP16copy(F.c, mem) + C := NewFP16copy(F.b, mem) + D := NewFP16(mem) - F.a.Sqr() + F.a.Sqr(mem) D.copy(F.a) - D.Add(F.a) - F.a.Add(D) + D.Add(F.a, mem) + F.a.Add(D, mem) F.a.norm() - A.nconj() + A.nconj(mem) - A.Add(A) - F.a.Add(A) - B.Sqr() - B.times_i() + A.Add(A, mem) + F.a.Add(A, mem) + B.Sqr(mem) + B.times_i(mem) D.copy(B) - D.Add(B) - B.Add(D) + D.Add(B, mem) + B.Add(D, mem) B.norm() - C.Sqr() + C.Sqr(mem) D.copy(C) - D.Add(C) - C.Add(D) + D.Add(C, mem) + C.Add(D, mem) C.norm() - F.b.conj() - F.b.Add(F.b) - F.c.nconj() + F.b.conj(mem) + F.b.Add(F.b, mem) + F.c.nconj(mem) - 
F.c.Add(F.c) - F.b.Add(B) - F.c.Add(C) - F.reduce() + F.c.Add(F.c, mem) + F.b.Add(B, mem) + F.c.Add(C, mem) + F.reduce(mem) F.stype = FP_DENSE } /* Chung-Hasan SQR2 method from http://cacr.uwaterloo.ca/techreports/2006/cacr2006-24.pdf */ -func (F *FP48) Sqr() { +func (F *FP48) Sqr(mem *arena.Arena) { if F.stype == FP_ONE { return } - A := NewFP16copy(F.a) - B := NewFP16copy(F.b) - C := NewFP16copy(F.c) - D := NewFP16copy(F.a) + A := NewFP16copy(F.a, mem) + B := NewFP16copy(F.b, mem) + C := NewFP16copy(F.c, mem) + D := NewFP16copy(F.a, mem) - A.Sqr() - B.Mul(F.c) - B.Add(B) + A.Sqr(mem) + B.Mul(F.c, mem) + B.Add(B, mem) B.norm() - C.Sqr() - D.Mul(F.b) - D.Add(D) + C.Sqr(mem) + D.Mul(F.b, mem) + D.Add(D, mem) - F.c.Add(F.a) - F.c.Add(F.b) + F.c.Add(F.a, mem) + F.c.Add(F.b, mem) F.c.norm() - F.c.Sqr() + F.c.Sqr(mem) F.a.copy(A) - A.Add(B) + A.Add(B, mem) A.norm() - A.Add(C) - A.Add(D) + A.Add(C, mem) + A.Add(D, mem) A.norm() - A.Neg() - B.times_i() - C.times_i() + A.Neg(mem) + B.times_i(mem) + C.times_i(mem) - F.a.Add(B) + F.a.Add(B, mem) F.b.copy(C) - F.b.Add(D) - F.c.Add(A) + F.b.Add(D, mem) + F.c.Add(A, mem) if F.stype == FP_SPARSER || F.stype == FP_SPARSEST { F.stype = FP_SPARSE } else { @@ -279,70 +316,70 @@ func (F *FP48) Sqr() { } /* FP48 full multiplication this=this*y */ -func (F *FP48) Mul(y *FP48) { - z0 := NewFP16copy(F.a) - z1 := NewFP16() - z2 := NewFP16copy(F.b) - z3 := NewFP16() - t0 := NewFP16copy(F.a) - t1 := NewFP16copy(y.a) +func (F *FP48) Mul(y *FP48, mem *arena.Arena) { + z0 := NewFP16copy(F.a, mem) + z1 := NewFP16(mem) + z2 := NewFP16copy(F.b, mem) + z3 := NewFP16(mem) + t0 := NewFP16copy(F.a, mem) + t1 := NewFP16copy(y.a, mem) - z0.Mul(y.a) - z2.Mul(y.b) + z0.Mul(y.a, mem) + z2.Mul(y.b, mem) - t0.Add(F.b) + t0.Add(F.b, mem) t0.norm() - t1.Add(y.b) + t1.Add(y.b, mem) t1.norm() z1.copy(t0) - z1.Mul(t1) + z1.Mul(t1, mem) t0.copy(F.b) - t0.Add(F.c) + t0.Add(F.c, mem) t0.norm() t1.copy(y.b) - t1.Add(y.c) + t1.Add(y.c, mem) t1.norm() z3.copy(t0) - 
z3.Mul(t1) + z3.Mul(t1, mem) t0.copy(z0) - t0.Neg() + t0.Neg(mem) t1.copy(z2) - t1.Neg() + t1.Neg(mem) - z1.Add(t0) + z1.Add(t0, mem) //z1.norm(); F.b.copy(z1) - F.b.Add(t1) + F.b.Add(t1, mem) - z3.Add(t1) - z2.Add(t0) + z3.Add(t1, mem) + z2.Add(t0, mem) t0.copy(F.a) - t0.Add(F.c) + t0.Add(F.c, mem) t0.norm() t1.copy(y.a) - t1.Add(y.c) + t1.Add(y.c, mem) t1.norm() - t0.Mul(t1) - z2.Add(t0) + t0.Mul(t1, mem) + z2.Add(t0, mem) t0.copy(F.c) - t0.Mul(y.c) + t0.Mul(y.c, mem) t1.copy(t0) - t1.Neg() + t1.Neg(mem) F.c.copy(z2) - F.c.Add(t1) - z3.Add(t1) - t0.times_i() - F.b.Add(t0) + F.c.Add(t1, mem) + z3.Add(t1, mem) + t0.times_i(mem) + F.b.Add(t0, mem) z3.norm() - z3.times_i() + z3.times_i(mem) F.a.copy(z0) - F.a.Add(z3) + F.a.Add(z3, mem) F.stype = FP_DENSE F.norm() } @@ -350,7 +387,7 @@ func (F *FP48) Mul(y *FP48) { /* FP48 full multiplication w=w*y */ /* Supports sparse multiplicands */ /* Usually w is denser than y */ -func (F *FP48) ssmul(y *FP48) { +func (F *FP48) ssmul(y *FP48, mem *arena.Arena) { if F.stype == FP_ONE { F.Copy(y) return @@ -359,483 +396,307 @@ func (F *FP48) ssmul(y *FP48) { return } if y.stype >= FP_SPARSE { - z0 := NewFP16copy(F.a) - z1 := NewFP16() - z2 := NewFP16() - z3 := NewFP16() - z0.Mul(y.a) + z0 := NewFP16copy(F.a, mem) + z1 := NewFP16(mem) + z2 := NewFP16(mem) + z3 := NewFP16(mem) + z0.Mul(y.a, mem) - if SEXTIC_TWIST == M_TYPE { - if y.stype == FP_SPARSE || F.stype == FP_SPARSE { - z2.getb().copy(F.b.getb()) - z2.getb().Mul(y.b.getb()) - z2.geta().zero() - if y.stype != FP_SPARSE { - z2.geta().copy(F.b.getb()) - z2.geta().Mul(y.b.geta()) - } - if F.stype != FP_SPARSE { - z2.geta().copy(F.b.geta()) - z2.geta().Mul(y.b.getb()) - } - z2.times_i() - } else { - z2.copy(F.b) - z2.Mul(y.b) - } - } else { - z2.copy(F.b) - z2.Mul(y.b) - } - t0 := NewFP16copy(F.a) - t1 := NewFP16copy(y.a) - t0.Add(F.b) + z2.copy(F.b) + z2.Mul(y.b, mem) + t0 := NewFP16copy(F.a, mem) + t1 := NewFP16copy(y.a, mem) + t0.Add(F.b, mem) t0.norm() - t1.Add(y.b) + 
t1.Add(y.b, mem) t1.norm() z1.copy(t0) - z1.Mul(t1) + z1.Mul(t1, mem) t0.copy(F.b) - t0.Add(F.c) + t0.Add(F.c, mem) t0.norm() t1.copy(y.b) - t1.Add(y.c) + t1.Add(y.c, mem) t1.norm() z3.copy(t0) - z3.Mul(t1) + z3.Mul(t1, mem) t0.copy(z0) - t0.Neg() + t0.Neg(mem) t1.copy(z2) - t1.Neg() + t1.Neg(mem) - z1.Add(t0) + z1.Add(t0, mem) F.b.copy(z1) - F.b.Add(t1) + F.b.Add(t1, mem) - z3.Add(t1) - z2.Add(t0) + z3.Add(t1, mem) + z2.Add(t0, mem) t0.copy(F.a) - t0.Add(F.c) + t0.Add(F.c, mem) t0.norm() t1.copy(y.a) - t1.Add(y.c) + t1.Add(y.c, mem) t1.norm() - t0.Mul(t1) - z2.Add(t0) + t0.Mul(t1, mem) + z2.Add(t0, mem) - if SEXTIC_TWIST == D_TYPE { - if y.stype == FP_SPARSE || F.stype == FP_SPARSE { - t0.geta().copy(F.c.geta()) - t0.geta().Mul(y.c.geta()) - t0.getb().zero() - if y.stype != FP_SPARSE { - t0.getb().copy(F.c.geta()) - t0.getb().Mul(y.c.getb()) - } - if F.stype != FP_SPARSE { - t0.getb().copy(F.c.getb()) - t0.getb().Mul(y.c.geta()) - } - } else { - t0.copy(F.c) - t0.Mul(y.c) + if y.stype == FP_SPARSE || F.stype == FP_SPARSE { + t0.geta().copy(F.c.geta()) + t0.geta().Mul(y.c.geta(), mem) + t0.getb().zero() + if y.stype != FP_SPARSE { + t0.getb().copy(F.c.geta()) + t0.getb().Mul(y.c.getb(), mem) + } + if F.stype != FP_SPARSE { + t0.getb().copy(F.c.getb()) + t0.getb().Mul(y.c.geta(), mem) } } else { t0.copy(F.c) - t0.Mul(y.c) + t0.Mul(y.c, mem) } t1.copy(t0) - t1.Neg() + t1.Neg(mem) F.c.copy(z2) - F.c.Add(t1) - z3.Add(t1) - t0.times_i() - F.b.Add(t0) + F.c.Add(t1, mem) + z3.Add(t1, mem) + t0.times_i(mem) + F.b.Add(t0, mem) z3.norm() - z3.times_i() + z3.times_i(mem) F.a.copy(z0) - F.a.Add(z3) + F.a.Add(z3, mem) } else { if F.stype == FP_SPARSER || F.stype == FP_SPARSEST { - F.smul(y) + F.smul(y, mem) return } - if SEXTIC_TWIST == D_TYPE { // dense by sparser - 13m - z0 := NewFP16copy(F.a) - z2 := NewFP16copy(F.b) - z3 := NewFP16copy(F.b) - t0 := NewFP16() - t1 := NewFP16copy(y.a) - z0.Mul(y.a) + z0 := NewFP16copy(F.a, mem) + z2 := NewFP16copy(F.b, mem) + z3 := 
NewFP16copy(F.b, mem) + t0 := NewFP16(mem) + t1 := NewFP16copy(y.a, mem) + z0.Mul(y.a, mem) - if y.stype == FP_SPARSEST { - z2.tmul(y.b.a.a.a.a) - } else { - z2.pmul(y.b.geta()) - } - F.b.Add(F.a) - t1.geta().Add(y.b.geta()) - - t1.norm() - F.b.norm() - F.b.Mul(t1) - z3.Add(F.c) - z3.norm() - - if y.stype == FP_SPARSEST { - z3.tmul(y.b.a.a.a.a) - } else { - z3.pmul(y.b.geta()) - } - - t0.copy(z0) - t0.Neg() - t1.copy(z2) - t1.Neg() - - F.b.Add(t0) - - F.b.Add(t1) - z3.Add(t1) - z2.Add(t0) - - t0.copy(F.a) - t0.Add(F.c) - t0.norm() - z3.norm() - t0.Mul(y.a) - F.c.copy(z2) - F.c.Add(t0) - - z3.times_i() - F.a.copy(z0) - F.a.Add(z3) + if y.stype == FP_SPARSEST { + z2.tmul(y.b.a.a.a.a, mem) + } else { + z2.pmul(y.b.geta(), mem) } - if SEXTIC_TWIST == M_TYPE { - z0 := NewFP16copy(F.a) - z1 := NewFP16() - z2 := NewFP16() - z3 := NewFP16() - t0 := NewFP16copy(F.a) - t1 := NewFP16() + F.b.Add(F.a, mem) + t1.geta().Add(y.b.geta(), mem) - z0.Mul(y.a) - t0.Add(F.b) - t0.norm() + t1.norm() + F.b.norm() + F.b.Mul(t1, mem) + z3.Add(F.c, mem) + z3.norm() - z1.copy(t0) - z1.Mul(y.a) - t0.copy(F.b) - t0.Add(F.c) - t0.norm() - - z3.copy(t0) - - if y.stype == FP_SPARSEST { - z3.tmul(y.c.b.a.a.a) - } else { - z3.pmul(y.c.getb()) - } - z3.times_i() - - t0.copy(z0) - t0.Neg() - z1.Add(t0) - F.b.copy(z1) - z2.copy(t0) - - t0.copy(F.a) - t0.Add(F.c) - t0.norm() - t1.copy(y.a) - t1.Add(y.c) - t1.norm() - - t0.Mul(t1) - z2.Add(t0) - t0.copy(F.c) - - if y.stype == FP_SPARSEST { - t0.tmul(y.c.b.a.a.a) - } else { - t0.pmul(y.c.getb()) - } - t0.times_i() - t1.copy(t0) - t1.Neg() - - F.c.copy(z2) - F.c.Add(t1) - z3.Add(t1) - t0.times_i() - F.b.Add(t0) - z3.norm() - z3.times_i() - F.a.copy(z0) - F.a.Add(z3) + if y.stype == FP_SPARSEST { + z3.tmul(y.b.a.a.a.a, mem) + } else { + z3.pmul(y.b.geta(), mem) } + + t0.copy(z0) + t0.Neg(mem) + t1.copy(z2) + t1.Neg(mem) + + F.b.Add(t0, mem) + + F.b.Add(t1, mem) + z3.Add(t1, mem) + z2.Add(t0, mem) + + t0.copy(F.a) + t0.Add(F.c, mem) + t0.norm() + z3.norm() 
+ t0.Mul(y.a, mem) + F.c.copy(z2) + F.c.Add(t0, mem) + + z3.times_i(mem) + F.a.copy(z0) + F.a.Add(z3, mem) } F.stype = FP_DENSE F.norm() } /* Special case of multiplication arises from special form of ATE pairing line function */ -func (F *FP48) smul(y *FP48) { - if SEXTIC_TWIST == D_TYPE { - w1 := NewFP8copy(F.a.geta()) - w2 := NewFP8copy(F.a.getb()) - var w3 *FP8 +func (F *FP48) smul(y *FP48, mem *arena.Arena) { + w1 := NewFP8copy(F.a.geta(), mem) + w2 := NewFP8copy(F.a.getb(), mem) + var w3 *FP8 - w1.Mul(y.a.geta()) - w2.Mul(y.a.getb()) + w1.Mul(y.a.geta(), mem) + w2.Mul(y.a.getb(), mem) - if y.stype == FP_SPARSEST || F.stype == FP_SPARSEST { - if y.stype == FP_SPARSEST && F.stype == FP_SPARSEST { - t := NewFPcopy(F.b.a.a.a.a) - t.Mul(y.b.a.a.a.a) - w3 = NewFP8fp(t) - } else { - if y.stype != FP_SPARSEST { - w3 = NewFP8copy(y.b.geta()) - w3.tmul(F.b.a.a.a.a) - } else { - w3 = NewFP8copy(F.b.geta()) - w3.tmul(y.b.a.a.a.a) - } - } + if y.stype == FP_SPARSEST || F.stype == FP_SPARSEST { + if y.stype == FP_SPARSEST && F.stype == FP_SPARSEST { + t := NewFPcopy(F.b.a.a.a.a, mem) + t.Mul(y.b.a.a.a.a, mem) + w3 = NewFP8fp(t, mem) } else { - w3 = NewFP8copy(F.b.geta()) - w3.Mul(y.b.geta()) + if y.stype != FP_SPARSEST { + w3 = NewFP8copy(y.b.geta(), mem) + w3.tmul(F.b.a.a.a.a, mem) + } else { + w3 = NewFP8copy(F.b.geta(), mem) + w3.tmul(y.b.a.a.a.a, mem) + } } - ta := NewFP8copy(F.a.geta()) - tb := NewFP8copy(y.a.geta()) - ta.Add(F.a.getb()) - ta.norm() - tb.Add(y.a.getb()) - tb.norm() - tc := NewFP8copy(ta) - tc.Mul(tb) - t := NewFP8copy(w1) - t.Add(w2) - t.Neg() - tc.Add(t) - - ta.copy(F.a.geta()) - ta.Add(F.b.geta()) - ta.norm() - tb.copy(y.a.geta()) - tb.Add(y.b.geta()) - tb.norm() - td := NewFP8copy(ta) - td.Mul(tb) - t.copy(w1) - t.Add(w3) - t.Neg() - td.Add(t) - - ta.copy(F.a.getb()) - ta.Add(F.b.geta()) - ta.norm() - tb.copy(y.a.getb()) - tb.Add(y.b.geta()) - tb.norm() - te := NewFP8copy(ta) - te.Mul(tb) - t.copy(w2) - t.Add(w3) - t.Neg() - te.Add(t) - - 
w2.times_i() - w1.Add(w2) - - F.a.geta().copy(w1) - F.a.getb().copy(tc) - F.b.geta().copy(td) - F.b.getb().copy(te) - F.c.geta().copy(w3) - F.c.getb().zero() - - F.a.norm() - F.b.norm() } else { - w1 := NewFP8copy(F.a.geta()) - w2 := NewFP8copy(F.a.getb()) - var w3 *FP8 - - w1.Mul(y.a.geta()) - w2.Mul(y.a.getb()) - - if y.stype == FP_SPARSEST || F.stype == FP_SPARSEST { - if y.stype == FP_SPARSEST && F.stype == FP_SPARSEST { - t := NewFPcopy(F.c.b.a.a.a) - t.Mul(y.c.b.a.a.a) - w3 = NewFP8fp(t) - } else { - if y.stype != FP_SPARSEST { - w3 = NewFP8copy(y.c.getb()) - w3.tmul(F.c.b.a.a.a) - } else { - w3 = NewFP8copy(F.c.getb()) - w3.tmul(y.c.b.a.a.a) - } - } - } else { - w3 = NewFP8copy(F.c.getb()) - w3.Mul(y.c.getb()) - } - - ta := NewFP8copy(F.a.geta()) - tb := NewFP8copy(y.a.geta()) - ta.Add(F.a.getb()) - ta.norm() - tb.Add(y.a.getb()) - tb.norm() - tc := NewFP8copy(ta) - tc.Mul(tb) - t := NewFP8copy(w1) - t.Add(w2) - t.Neg() - tc.Add(t) - - ta.copy(F.a.geta()) - ta.Add(F.c.getb()) - ta.norm() - tb.copy(y.a.geta()) - tb.Add(y.c.getb()) - tb.norm() - td := NewFP8copy(ta) - td.Mul(tb) - t.copy(w1) - t.Add(w3) - t.Neg() - td.Add(t) - - ta.copy(F.a.getb()) - ta.Add(F.c.getb()) - ta.norm() - tb.copy(y.a.getb()) - tb.Add(y.c.getb()) - tb.norm() - te := NewFP8copy(ta) - te.Mul(tb) - t.copy(w2) - t.Add(w3) - t.Neg() - te.Add(t) - - w2.times_i() - w1.Add(w2) - F.a.geta().copy(w1) - F.a.getb().copy(tc) - - w3.times_i() - w3.norm() - F.b.geta().zero() - F.b.getb().copy(w3) - - te.norm() - te.times_i() - F.c.geta().copy(te) - F.c.getb().copy(td) - - F.a.norm() - F.c.norm() - + w3 = NewFP8copy(F.b.geta(), mem) + w3.Mul(y.b.geta(), mem) } + ta := NewFP8copy(F.a.geta(), mem) + tb := NewFP8copy(y.a.geta(), mem) + ta.Add(F.a.getb(), mem) + ta.norm() + tb.Add(y.a.getb(), mem) + tb.norm() + tc := NewFP8copy(ta, mem) + tc.Mul(tb, mem) + t := NewFP8copy(w1, mem) + t.Add(w2, mem) + t.Neg(mem) + tc.Add(t, mem) + + ta.copy(F.a.geta()) + ta.Add(F.b.geta(), mem) + ta.norm() + 
tb.copy(y.a.geta()) + tb.Add(y.b.geta(), mem) + tb.norm() + td := NewFP8copy(ta, mem) + td.Mul(tb, mem) + t.copy(w1) + t.Add(w3, mem) + t.Neg(mem) + td.Add(t, mem) + + ta.copy(F.a.getb()) + ta.Add(F.b.geta(), mem) + ta.norm() + tb.copy(y.a.getb()) + tb.Add(y.b.geta(), mem) + tb.norm() + te := NewFP8copy(ta, mem) + te.Mul(tb, mem) + t.copy(w2) + t.Add(w3, mem) + t.Neg(mem) + te.Add(t, mem) + + w2.times_i(mem) + w1.Add(w2, mem) + + F.a.geta().copy(w1) + F.a.getb().copy(tc) + F.b.geta().copy(td) + F.b.getb().copy(te) + F.c.geta().copy(w3) + F.c.getb().zero() + + F.a.norm() + F.b.norm() F.stype = FP_SPARSE } /* this=1/this */ -func (F *FP48) Invert() { - f0 := NewFP16copy(F.a) - f1 := NewFP16copy(F.b) - f2 := NewFP16copy(F.a) - f3 := NewFP16() +func (F *FP48) Invert(mem *arena.Arena) { + f0 := NewFP16copy(F.a, mem) + f1 := NewFP16copy(F.b, mem) + f2 := NewFP16copy(F.a, mem) + f3 := NewFP16(mem) //F.norm() - f0.Sqr() - f1.Mul(F.c) - f1.times_i() - f0.Sub(f1) + f0.Sqr(mem) + f1.Mul(F.c, mem) + f1.times_i(mem) + f0.Sub(f1, mem) f0.norm() f1.copy(F.c) - f1.Sqr() - f1.times_i() - f2.Mul(F.b) - f1.Sub(f2) + f1.Sqr(mem) + f1.times_i(mem) + f2.Mul(F.b, mem) + f1.Sub(f2, mem) f1.norm() f2.copy(F.b) - f2.Sqr() + f2.Sqr(mem) f3.copy(F.a) - f3.Mul(F.c) - f2.Sub(f3) + f3.Mul(F.c, mem) + f2.Sub(f3, mem) f2.norm() f3.copy(F.b) - f3.Mul(f2) - f3.times_i() - F.a.Mul(f0) - f3.Add(F.a) - F.c.Mul(f1) - F.c.times_i() + f3.Mul(f2, mem) + f3.times_i(mem) + F.a.Mul(f0, mem) + f3.Add(F.a, mem) + F.c.Mul(f1, mem) + F.c.times_i(mem) - f3.Add(F.c) + f3.Add(F.c, mem) f3.norm() - f3.Invert() + f3.Invert(mem) F.a.copy(f0) - F.a.Mul(f3) + F.a.Mul(f3, mem) F.b.copy(f1) - F.b.Mul(f3) + F.b.Mul(f3, mem) F.c.copy(f2) - F.c.Mul(f3) + F.c.Mul(f3, mem) F.stype = FP_DENSE } /* this=this^p using Frobenius */ -func (F *FP48) frob(f *FP2, n int) { - f2 := NewFP2copy(f) - f3 := NewFP2copy(f) +func (F *FP48) frob(f *FP2, n int, mem *arena.Arena) { + f2 := NewFP2copy(f, mem) + f3 := NewFP2copy(f, mem) - f2.Sqr() - 
f3.Mul(f2) + f2.Sqr(mem) + f3.Mul(f2, mem) - f3.Mul_ip() + f3.Mul_ip(mem) f3.norm() - f3.Mul_ip() + f3.Mul_ip(mem) f3.norm() for i := 0; i < n; i++ { - F.a.frob(f3) - F.b.frob(f3) - F.c.frob(f3) + F.a.frob(f3, mem) + F.b.frob(f3, mem) + F.c.frob(f3, mem) - F.b.qmul(f) - F.b.times_i4() - F.b.times_i2() - F.c.qmul(f2) - F.c.times_i4() - F.c.times_i4() - F.c.times_i4() + F.b.qmul(f, mem) + F.b.times_i4(mem) + F.b.times_i2(mem) + F.c.qmul(f2, mem) + F.c.times_i4(mem) + F.c.times_i4(mem) + F.c.times_i4(mem) } F.stype = FP_DENSE } /* trace function */ -func (F *FP48) trace() *FP16 { - t := NewFP16() +func (F *FP48) trace(mem *arena.Arena) *FP16 { + t := NewFP16(mem) t.copy(F.a) - t.imul(3) - t.reduce() + t.imul(3, mem) + t.reduce(mem) return t } @@ -856,7 +717,7 @@ func FP48_fromBytes(w []byte) *FP48 { t[i] = w[i+2*MB] } a := FP16_fromBytes(t[:]) - return NewFP48fp16s(a, b, c) + return NewFP48fp16s(a, b, c, nil) } /* convert this to byte array */ @@ -883,48 +744,48 @@ func (F *FP48) ToString() string { } /* this=this^e */ -func (F *FP48) Pow(e *BIG) *FP48 { - sf := NewFP48copy(F) +func (F *FP48) Pow(e *BIG, mem *arena.Arena) *FP48 { + sf := NewFP48copy(F, mem) sf.norm() - e1 := NewBIGcopy(e) + e1 := NewBIGcopy(e, mem) e1.norm() - e3 := NewBIGcopy(e1) + e3 := NewBIGcopy(e1, mem) e3.pmul(3) e3.norm() - w := NewFP48copy(sf) + w := NewFP48copy(sf, mem) if e3.IsZero() { w.one() return w } nb := e3.nbits() for i := nb - 2; i >= 1; i-- { - w.uSqr() + w.uSqr(mem) bt := e3.bit(i) - e1.bit(i) if bt == 1 { - w.Mul(sf) + w.Mul(sf, mem) } if bt == -1 { - sf.conj() - w.Mul(sf) - sf.conj() + sf.conj(mem) + w.Mul(sf, mem) + sf.conj(mem) } } - w.reduce() + w.reduce(mem) return w } /* constant time powering by small integer of max length bts */ -func (F *FP48) pinpow(e int, bts int) { +func (F *FP48) pinpow(e int, bts int, mem *arena.Arena) { var R []*FP48 - R = append(R, NewFP48int(1)) - R = append(R, NewFP48copy(F)) + R = append(R, NewFP48int(1, mem)) + R = append(R, NewFP48copy(F, 
mem)) for i := bts - 1; i >= 0; i-- { b := (e >> uint(i)) & 1 - R[1-b].Mul(R[b]) - R[b].uSqr() + R[1-b].Mul(R[b], mem) + R[b].uSqr(mem) } F.Copy(R[0]) } @@ -985,79 +846,79 @@ func pow16(q []*FP48, u []*BIG) *FP48 { var w4 [NLEN*int(BASEBITS) + 1]int8 var s4 [NLEN*int(BASEBITS) + 1]int8 var t []*BIG - r := NewFP48() - p := NewFP48() - mt := NewBIGint(0) + r := NewFP48(nil) + p := NewFP48(nil) + mt := NewBIGint(0, nil) var bt int8 var k int for i := 0; i < 16; i++ { - t = append(t, NewBIGcopy(u[i])) + t = append(t, NewBIGcopy(u[i], nil)) } - g1 = append(g1, NewFP48copy(q[0])) // q[0] - g1 = append(g1, NewFP48copy(g1[0])) - g1[1].Mul(q[1]) // q[0].q[1] - g1 = append(g1, NewFP48copy(g1[0])) - g1[2].Mul(q[2]) // q[0].q[2] - g1 = append(g1, NewFP48copy(g1[1])) - g1[3].Mul(q[2]) // q[0].q[1].q[2] - g1 = append(g1, NewFP48copy(g1[0])) - g1[4].Mul(q[3]) // q[0].q[3] - g1 = append(g1, NewFP48copy(g1[1])) - g1[5].Mul(q[3]) // q[0].q[1].q[3] - g1 = append(g1, NewFP48copy(g1[2])) - g1[6].Mul(q[3]) // q[0].q[2].q[3] - g1 = append(g1, NewFP48copy(g1[3])) - g1[7].Mul(q[3]) // q[0].q[1].q[2].q[3] + g1 = append(g1, NewFP48copy(q[0], nil)) // q[0] + g1 = append(g1, NewFP48copy(g1[0], nil)) + g1[1].Mul(q[1], nil) // q[0].q[1] + g1 = append(g1, NewFP48copy(g1[0], nil)) + g1[2].Mul(q[2], nil) // q[0].q[2] + g1 = append(g1, NewFP48copy(g1[1], nil)) + g1[3].Mul(q[2], nil) // q[0].q[1].q[2] + g1 = append(g1, NewFP48copy(g1[0], nil)) + g1[4].Mul(q[3], nil) // q[0].q[3] + g1 = append(g1, NewFP48copy(g1[1], nil)) + g1[5].Mul(q[3], nil) // q[0].q[1].q[3] + g1 = append(g1, NewFP48copy(g1[2], nil)) + g1[6].Mul(q[3], nil) // q[0].q[2].q[3] + g1 = append(g1, NewFP48copy(g1[3], nil)) + g1[7].Mul(q[3], nil) // q[0].q[1].q[2].q[3] - g2 = append(g2, NewFP48copy(q[4])) // q[0] - g2 = append(g2, NewFP48copy(g2[0])) - g2[1].Mul(q[5]) // q[0].q[1] - g2 = append(g2, NewFP48copy(g2[0])) - g2[2].Mul(q[6]) // q[0].q[2] - g2 = append(g2, NewFP48copy(g2[1])) - g2[3].Mul(q[6]) // q[0].q[1].q[2] - g2 = append(g2, 
NewFP48copy(g2[0])) - g2[4].Mul(q[7]) // q[0].q[3] - g2 = append(g2, NewFP48copy(g2[1])) - g2[5].Mul(q[7]) // q[0].q[1].q[3] - g2 = append(g2, NewFP48copy(g2[2])) - g2[6].Mul(q[7]) // q[0].q[2].q[3] - g2 = append(g2, NewFP48copy(g2[3])) - g2[7].Mul(q[7]) // q[0].q[1].q[2].q[3] + g2 = append(g2, NewFP48copy(q[4], nil)) // q[0] + g2 = append(g2, NewFP48copy(g2[0], nil)) + g2[1].Mul(q[5], nil) // q[0].q[1] + g2 = append(g2, NewFP48copy(g2[0], nil)) + g2[2].Mul(q[6], nil) // q[0].q[2] + g2 = append(g2, NewFP48copy(g2[1], nil)) + g2[3].Mul(q[6], nil) // q[0].q[1].q[2] + g2 = append(g2, NewFP48copy(g2[0], nil)) + g2[4].Mul(q[7], nil) // q[0].q[3] + g2 = append(g2, NewFP48copy(g2[1], nil)) + g2[5].Mul(q[7], nil) // q[0].q[1].q[3] + g2 = append(g2, NewFP48copy(g2[2], nil)) + g2[6].Mul(q[7], nil) // q[0].q[2].q[3] + g2 = append(g2, NewFP48copy(g2[3], nil)) + g2[7].Mul(q[7], nil) // q[0].q[1].q[2].q[3] - g3 = append(g3, NewFP48copy(q[8])) // q[0] - g3 = append(g3, NewFP48copy(g3[0])) - g3[1].Mul(q[9]) // q[0].q[1] - g3 = append(g3, NewFP48copy(g3[0])) - g3[2].Mul(q[10]) // q[0].q[2] - g3 = append(g3, NewFP48copy(g3[1])) - g3[3].Mul(q[10]) // q[0].q[1].q[2] - g3 = append(g3, NewFP48copy(g3[0])) - g3[4].Mul(q[11]) // q[0].q[3] - g3 = append(g3, NewFP48copy(g3[1])) - g3[5].Mul(q[11]) // q[0].q[1].q[3] - g3 = append(g3, NewFP48copy(g3[2])) - g3[6].Mul(q[11]) // q[0].q[2].q[3] - g3 = append(g3, NewFP48copy(g3[3])) - g3[7].Mul(q[11]) // q[0].q[1].q[2].q[3] + g3 = append(g3, NewFP48copy(q[8], nil)) // q[0] + g3 = append(g3, NewFP48copy(g3[0], nil)) + g3[1].Mul(q[9], nil) // q[0].q[1] + g3 = append(g3, NewFP48copy(g3[0], nil)) + g3[2].Mul(q[10], nil) // q[0].q[2] + g3 = append(g3, NewFP48copy(g3[1], nil)) + g3[3].Mul(q[10], nil) // q[0].q[1].q[2] + g3 = append(g3, NewFP48copy(g3[0], nil)) + g3[4].Mul(q[11], nil) // q[0].q[3] + g3 = append(g3, NewFP48copy(g3[1], nil)) + g3[5].Mul(q[11], nil) // q[0].q[1].q[3] + g3 = append(g3, NewFP48copy(g3[2], nil)) + g3[6].Mul(q[11], nil) // 
q[0].q[2].q[3] + g3 = append(g3, NewFP48copy(g3[3], nil)) + g3[7].Mul(q[11], nil) // q[0].q[1].q[2].q[3] - g4 = append(g4, NewFP48copy(q[12])) // q[0] - g4 = append(g4, NewFP48copy(g4[0])) - g4[1].Mul(q[13]) // q[0].q[1] - g4 = append(g4, NewFP48copy(g4[0])) - g4[2].Mul(q[14]) // q[0].q[2] - g4 = append(g4, NewFP48copy(g4[1])) - g4[3].Mul(q[14]) // q[0].q[1].q[2] - g4 = append(g4, NewFP48copy(g4[0])) - g4[4].Mul(q[15]) // q[0].q[3] - g4 = append(g4, NewFP48copy(g4[1])) - g4[5].Mul(q[15]) // q[0].q[1].q[3] - g4 = append(g4, NewFP48copy(g4[2])) - g4[6].Mul(q[15]) // q[0].q[2].q[3] - g4 = append(g4, NewFP48copy(g4[3])) - g4[7].Mul(q[15]) // q[0].q[1].q[2].q[3] + g4 = append(g4, NewFP48copy(q[12], nil)) // q[0] + g4 = append(g4, NewFP48copy(g4[0], nil)) + g4[1].Mul(q[13], nil) // q[0].q[1] + g4 = append(g4, NewFP48copy(g4[0], nil)) + g4[2].Mul(q[14], nil) // q[0].q[2] + g4 = append(g4, NewFP48copy(g4[1], nil)) + g4[3].Mul(q[14], nil) // q[0].q[1].q[2] + g4 = append(g4, NewFP48copy(g4[0], nil)) + g4[4].Mul(q[15], nil) // q[0].q[3] + g4 = append(g4, NewFP48copy(g4[1], nil)) + g4[5].Mul(q[15], nil) // q[0].q[1].q[3] + g4 = append(g4, NewFP48copy(g4[2], nil)) + g4[6].Mul(q[15], nil) // q[0].q[2].q[3] + g4 = append(g4, NewFP48copy(g4[3], nil)) + g4[7].Mul(q[15], nil) // q[0].q[1].q[2].q[3] // Make them odd pb1 := 1 - t[0].parity() @@ -1149,41 +1010,41 @@ func pow16(q []*FP48, u []*BIG) *FP48 { // Main loop p.selector(g1, int32(2*w1[nb-1]+1)) r.selector(g2, int32(2*w2[nb-1]+1)) - p.Mul(r) + p.Mul(r, nil) r.selector(g3, int32(2*w3[nb-1]+1)) - p.Mul(r) + p.Mul(r, nil) r.selector(g4, int32(2*w4[nb-1]+1)) - p.Mul(r) + p.Mul(r, nil) for i := nb - 2; i >= 0; i-- { - p.uSqr() + p.uSqr(nil) r.selector(g1, int32(2*w1[i]+s1[i])) - p.Mul(r) + p.Mul(r, nil) r.selector(g2, int32(2*w2[i]+s2[i])) - p.Mul(r) + p.Mul(r, nil) r.selector(g3, int32(2*w3[i]+s3[i])) - p.Mul(r) + p.Mul(r, nil) r.selector(g4, int32(2*w4[i]+s4[i])) - p.Mul(r) + p.Mul(r, nil) } // apply correction r.Copy(q[0]) - 
r.conj() - r.Mul(p) + r.conj(nil) + r.Mul(p, nil) p.cmove(r, pb1) r.Copy(q[4]) - r.conj() - r.Mul(p) + r.conj(nil) + r.Mul(p, nil) p.cmove(r, pb2) r.Copy(q[8]) - r.conj() - r.Mul(p) + r.conj(nil) + r.Mul(p, nil) p.cmove(r, pb3) r.Copy(q[12]) - r.conj() - r.Mul(p) + r.conj(nil) + r.Mul(p, nil) p.cmove(r, pb4) - p.reduce() + p.reduce(nil) return p } diff --git a/nekryptology/pkg/core/curves/native/bls48581/fp8.go b/nekryptology/pkg/core/curves/native/bls48581/fp8.go index 4b94ff8..eed3355 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/fp8.go +++ b/nekryptology/pkg/core/curves/native/bls48581/fp8.go @@ -23,7 +23,11 @@ package bls48581 -import "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" +import ( + "arena" + + "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" +) //import "fmt" @@ -32,66 +36,115 @@ type FP8 struct { b *FP4 } -func NewFP8() *FP8 { - F := new(FP8) - F.a = NewFP4() - F.b = NewFP4() - return F +func NewFP8(mem *arena.Arena) *FP8 { + if mem != nil { + F := arena.New[FP8](mem) + F.a = NewFP4(mem) + F.b = NewFP4(mem) + return F + } else { + F := new(FP8) + F.a = NewFP4(nil) + F.b = NewFP4(nil) + return F + } } /* Constructors */ -func NewFP8int(a int) *FP8 { - F := new(FP8) - F.a = NewFP4int(a) - F.b = NewFP4() - return F +func NewFP8int(a int, mem *arena.Arena) *FP8 { + if mem != nil { + F := arena.New[FP8](mem) + F.a = NewFP4int(a, mem) + F.b = NewFP4(mem) + return F + } else { + F := new(FP8) + F.a = NewFP4int(a, nil) + F.b = NewFP4(nil) + return F + } } /* Constructors */ -func NewFP8ints(a int, b int) *FP8 { - F := new(FP8) - F.a = NewFP4int(a) - F.b = NewFP4int(b) - return F +func NewFP8ints(a int, b int, mem *arena.Arena) *FP8 { + if mem != nil { + F := arena.New[FP8](mem) + F.a = NewFP4int(a, mem) + F.b = NewFP4int(b, mem) + return F + } else { + F := new(FP8) + F.a = NewFP4int(a, nil) + F.b = NewFP4int(b, nil) + return F + } } -func NewFP8copy(x 
*FP8) *FP8 { - F := new(FP8) - F.a = NewFP4copy(x.a) - F.b = NewFP4copy(x.b) - return F +func NewFP8copy(x *FP8, mem *arena.Arena) *FP8 { + if mem != nil { + F := arena.New[FP8](mem) + F.a = NewFP4copy(x.a, mem) + F.b = NewFP4copy(x.b, mem) + return F + } else { + F := new(FP8) + F.a = NewFP4copy(x.a, nil) + F.b = NewFP4copy(x.b, nil) + return F + } } -func NewFP8fp4s(c *FP4, d *FP4) *FP8 { - F := new(FP8) - F.a = NewFP4copy(c) - F.b = NewFP4copy(d) - return F +func NewFP8fp4s(c *FP4, d *FP4, mem *arena.Arena) *FP8 { + if mem != nil { + F := arena.New[FP8](mem) + F.a = NewFP4copy(c, mem) + F.b = NewFP4copy(d, mem) + return F + } else { + F := new(FP8) + F.a = NewFP4copy(c, nil) + F.b = NewFP4copy(d, nil) + return F + } } -func NewFP8fp4(c *FP4) *FP8 { - F := new(FP8) - F.a = NewFP4copy(c) - F.b = NewFP4() - return F +func NewFP8fp4(c *FP4, mem *arena.Arena) *FP8 { + if mem != nil { + F := arena.New[FP8](mem) + F.a = NewFP4copy(c, mem) + F.b = NewFP4(mem) + return F + } else { + F := new(FP8) + F.a = NewFP4copy(c, nil) + F.b = NewFP4(nil) + return F + } } -func NewFP8fp(c *FP) *FP8 { - F := new(FP8) - F.a = NewFP4fp(c) - F.b = NewFP4() - return F +func NewFP8fp(c *FP, mem *arena.Arena) *FP8 { + if mem != nil { + F := arena.New[FP8](mem) + F.a = NewFP4fp(c, mem) + F.b = NewFP4(mem) + return F + } else { + F := new(FP8) + F.a = NewFP4fp(c, nil) + F.b = NewFP4(nil) + return F + } } func NewFP8rand(rng *ext.RAND) *FP8 { - F := NewFP8fp4s(NewFP4rand(rng), NewFP4rand(rng)) + F := NewFP8fp4s(NewFP4rand(rng), NewFP4rand(rng), nil) return F } /* reduce all components of this mod Modulus */ -func (F *FP8) reduce() { - F.a.reduce() - F.b.reduce() +func (F *FP8) reduce(mem *arena.Arena) { + F.a.reduce(mem) + F.b.reduce(mem) } /* normalise all components of this mod Modulus */ @@ -101,12 +154,12 @@ func (F *FP8) norm() { } /* test this==0 ? 
*/ -func (F *FP8) IsZero() bool { - return F.a.IsZero() && F.b.IsZero() +func (F *FP8) IsZero(mem *arena.Arena) bool { + return F.a.IsZero(mem) && F.b.IsZero(mem) } func (F *FP8) islarger() int { - if F.IsZero() { + if F.IsZero(nil) { return 0 } cmp := F.b.islarger() @@ -140,7 +193,7 @@ func FP8_fromBytes(bf []byte) *FP8 { t[i] = bf[i+MB] } ta := FP4_fromBytes(t[:]) - return NewFP8fp4s(ta, tb) + return NewFP8fp4s(ta, tb, nil) } /* Conditional move */ @@ -151,13 +204,15 @@ func (F *FP8) cmove(g *FP8, d int) { /* test this==1 ? */ func (F *FP8) isunity() bool { - one := NewFP4int(1) - return F.a.Equals(one) && F.b.IsZero() + mem := arena.NewArena() + defer mem.Free() + one := NewFP4int(1, mem) + return F.a.Equals(one) && F.b.IsZero(mem) } /* test is w real? That is in a+ib test b is zero */ func (F *FP8) isreal() bool { - return F.b.IsZero() + return F.b.IsZero(nil) } /* extract real part a */ @@ -198,12 +253,12 @@ func (F *FP8) one() { } /* Return sign */ -func (F *FP8) sign() int { - p1 := F.a.sign() - p2 := F.b.sign() +func (F *FP8) sign(mem *arena.Arena) int { + p1 := F.a.sign(mem) + p2 := F.b.sign(mem) var u int if BIG_ENDIAN_SIGN { - if F.b.IsZero() { + if F.b.IsZero(mem) { u = 1 } else { u = 0 @@ -211,7 +266,7 @@ func (F *FP8) sign() int { p2 ^= (p1 ^ p2) & u return p2 } else { - if F.a.IsZero() { + if F.a.IsZero(mem) { u = 1 } else { u = 0 @@ -222,137 +277,137 @@ func (F *FP8) sign() int { } /* set this=-this */ -func (F *FP8) Neg() { +func (F *FP8) Neg(mem *arena.Arena) { F.norm() - m := NewFP4copy(F.a) - t := NewFP4() - m.Add(F.b) - m.Neg() + m := NewFP4copy(F.a, mem) + t := NewFP4(mem) + m.Add(F.b, mem) + m.Neg(mem) t.copy(m) - t.Add(F.b) + t.Add(F.b, mem) F.b.copy(m) - F.b.Add(F.a) + F.b.Add(F.a, mem) F.a.copy(t) F.norm() } /* this=conjugate(this) */ -func (F *FP8) conj() { - F.b.Neg() +func (F *FP8) conj(mem *arena.Arena) { + F.b.Neg(mem) F.norm() } /* this=-conjugate(this) */ -func (F *FP8) nconj() { - F.a.Neg() +func (F *FP8) nconj(mem *arena.Arena) { 
+ F.a.Neg(mem) F.norm() } /* this+=x */ -func (F *FP8) Add(x *FP8) { - F.a.Add(x.a) - F.b.Add(x.b) +func (F *FP8) Add(x *FP8, mem *arena.Arena) { + F.a.Add(x.a, mem) + F.b.Add(x.b, mem) } /* this-=x */ -func (F *FP8) Sub(x *FP8) { - m := NewFP8copy(x) - m.Neg() - F.Add(m) +func (F *FP8) Sub(x *FP8, mem *arena.Arena) { + m := NewFP8copy(x, mem) + m.Neg(mem) + F.Add(m, mem) } /* this-=x */ -func (F *FP8) rsub(x *FP8) { - F.Neg() - F.Add(x) +func (F *FP8) rsub(x *FP8, mem *arena.Arena) { + F.Neg(mem) + F.Add(x, mem) } /* this*=s where s is FP4 */ -func (F *FP8) pmul(s *FP4) { - F.a.Mul(s) - F.b.Mul(s) +func (F *FP8) pmul(s *FP4, mem *arena.Arena) { + F.a.Mul(s, mem) + F.b.Mul(s, mem) } /* this*=s where s is FP2 */ -func (F *FP8) qmul(s *FP2) { - F.a.pmul(s) - F.b.pmul(s) +func (F *FP8) qmul(s *FP2, mem *arena.Arena) { + F.a.pmul(s, mem) + F.b.pmul(s, mem) } /* this*=s where s is FP */ -func (F *FP8) tmul(s *FP) { - F.a.qmul(s) - F.b.qmul(s) +func (F *FP8) tmul(s *FP, mem *arena.Arena) { + F.a.qmul(s, mem) + F.b.qmul(s, mem) } /* this*=c where c is int */ -func (F *FP8) imul(c int) { - F.a.imul(c) - F.b.imul(c) +func (F *FP8) imul(c int, mem *arena.Arena) { + F.a.imul(c, mem) + F.b.imul(c, mem) } /* this*=this */ -func (F *FP8) Sqr() { - t1 := NewFP4copy(F.a) - t2 := NewFP4copy(F.b) - t3 := NewFP4copy(F.a) +func (F *FP8) Sqr(mem *arena.Arena) { + t1 := NewFP4copy(F.a, mem) + t2 := NewFP4copy(F.b, mem) + t3 := NewFP4copy(F.a, mem) - t3.Mul(F.b) - t1.Add(F.b) - t2.times_i() + t3.Mul(F.b, mem) + t1.Add(F.b, mem) + t2.times_i(mem) - t2.Add(F.a) + t2.Add(F.a, mem) t1.norm() t2.norm() F.a.copy(t1) - F.a.Mul(t2) + F.a.Mul(t2, mem) t2.copy(t3) - t2.times_i() - t2.Add(t3) + t2.times_i(mem) + t2.Add(t3, mem) t2.norm() - t2.Neg() - F.a.Add(t2) + t2.Neg(mem) + F.a.Add(t2, mem) F.b.copy(t3) - F.b.Add(t3) + F.b.Add(t3, mem) F.norm() } /* this*=y */ -func (F *FP8) Mul(y *FP8) { - t1 := NewFP4copy(F.a) - t2 := NewFP4copy(F.b) - t3 := NewFP4() - t4 := NewFP4copy(F.b) +func (F *FP8) 
Mul(y *FP8, mem *arena.Arena) { + t1 := NewFP4copy(F.a, mem) + t2 := NewFP4copy(F.b, mem) + t3 := NewFP4(mem) + t4 := NewFP4copy(F.b, mem) - t1.Mul(y.a) - t2.Mul(y.b) + t1.Mul(y.a, mem) + t2.Mul(y.b, mem) t3.copy(y.b) - t3.Add(y.a) - t4.Add(F.a) + t3.Add(y.a, mem) + t4.Add(F.a, mem) t3.norm() t4.norm() - t4.Mul(t3) + t4.Mul(t3, mem) t3.copy(t1) - t3.Neg() - t4.Add(t3) + t3.Neg(mem) + t4.Add(t3, mem) t4.norm() t3.copy(t2) - t3.Neg() + t3.Neg(mem) F.b.copy(t4) - F.b.Add(t3) + F.b.Add(t3, mem) - t2.times_i() + t2.times_i(mem) F.a.copy(t2) - F.a.Add(t1) + F.a.Add(t1, mem) F.norm() } @@ -363,55 +418,55 @@ func (F *FP8) toString() string { } /* this=1/this */ -func (F *FP8) Invert(h *FP) { - t1 := NewFP4copy(F.a) - t2 := NewFP4copy(F.b) +func (F *FP8) Invert(h *FP, mem *arena.Arena) { + t1 := NewFP4copy(F.a, mem) + t2 := NewFP4copy(F.b, mem) - t1.Sqr() - t2.Sqr() - t2.times_i() + t1.Sqr(mem) + t2.Sqr(mem) + t2.times_i(mem) t2.norm() - t1.Sub(t2) + t1.Sub(t2, mem) t1.norm() - t1.Invert(h) + t1.Invert(h, mem) - F.a.Mul(t1) - t1.Neg() + F.a.Mul(t1, mem) + t1.Neg(mem) t1.norm() - F.b.Mul(t1) + F.b.Mul(t1, mem) } /* this*=i where i = sqrt(sqrt(-1+sqrt(-1))) */ -func (F *FP8) times_i() { - s := NewFP4copy(F.b) - t := NewFP4copy(F.a) - s.times_i() +func (F *FP8) times_i(mem *arena.Arena) { + s := NewFP4copy(F.b, mem) + t := NewFP4copy(F.a, mem) + s.times_i(mem) F.a.copy(s) F.b.copy(t) F.norm() if TOWER == POSITOWER { - F.Neg() + F.Neg(mem) F.norm() } } -func (F *FP8) times_i2() { - F.a.times_i() - F.b.times_i() +func (F *FP8) times_i2(mem *arena.Arena) { + F.a.times_i(mem) + F.b.times_i(mem) } /* this=this^p using Frobenius */ -func (F *FP8) frob(f *FP2) { - ff := NewFP2copy(f) - ff.Sqr() - ff.Mul_ip() +func (F *FP8) frob(f *FP2, mem *arena.Arena) { + ff := NewFP2copy(f, mem) + ff.Sqr(mem) + ff.Mul_ip(mem) ff.norm() - F.a.frob(ff) - F.b.frob(ff) - F.b.pmul(f) - F.b.times_i() + F.a.frob(ff, mem) + F.b.frob(ff, mem) + F.b.pmul(f, mem) + F.b.times_i(mem) } /* this=this^e @@ 
-671,19 +726,19 @@ func (F *FP8) xtr_pow2(ck *FP8, ckml *FP8, ckm2l *FP8, a *BIG, b *BIG) *FP8 { } */ /* this/=2 */ -func (F *FP8) div2() { - F.a.div2() - F.b.div2() +func (F *FP8) div2(mem *arena.Arena) { + F.a.div2(mem) + F.b.div2(mem) } -func (F *FP8) div_i() { - u := NewFP4copy(F.a) - v := NewFP4copy(F.b) - u.div_i() +func (F *FP8) div_i(mem *arena.Arena) { + u := NewFP4copy(F.a, mem) + v := NewFP4copy(F.b, mem) + u.div_i(mem) F.a.copy(v) F.b.copy(u) if TOWER == POSITOWER { - F.Neg() + F.Neg(mem) F.norm() } } @@ -710,70 +765,72 @@ func (F *FP8) pow(b *BIG) { /* */ // Test for Quadratic Residue func (F *FP8) qr(h *FP) int { - c := NewFP8copy(F) - c.conj() - c.Mul(F) + mem := arena.NewArena() + defer mem.Free() + c := NewFP8copy(F, mem) + c.conj(mem) + c.Mul(F, mem) return c.a.qr(h) } // sqrt(a+ib) = sqrt(a+sqrt(a*a-n*b*b)/2)+ib/(2*sqrt(a+sqrt(a*a-n*b*b)/2)) -func (F *FP8) Sqrt(h *FP) { - if F.IsZero() { +func (F *FP8) Sqrt(h *FP, mem *arena.Arena) { + if F.IsZero(mem) { return } - a := NewFP4copy(F.a) - b := NewFP4() - s := NewFP4copy(F.b) - t := NewFP4copy(F.a) - hint := NewFP() + a := NewFP4copy(F.a, mem) + b := NewFP4(mem) + s := NewFP4copy(F.b, mem) + t := NewFP4copy(F.a, mem) + hint := NewFP(mem) - s.Sqr() - a.Sqr() - s.times_i() + s.Sqr(mem) + a.Sqr(mem) + s.times_i(mem) s.norm() - a.Sub(s) + a.Sub(s, mem) s.copy(a) s.norm() - s.Sqrt(h) + s.Sqrt(h, mem) a.copy(t) b.copy(t) - a.Add(s) + a.Add(s, mem) a.norm() - a.div2() + a.div2(mem) b.copy(F.b) - b.div2() + b.div2(mem) qr := a.qr(hint) // tweak hint - multiply old hint by Norm(1/Beta)^e where Beta is irreducible polynomial s.copy(a) - twk := NewFPbig(NewBIGints(TWK)) - twk.Mul(hint) - s.div_i() + twk := NewFPbig(NewBIGints(TWK, mem), mem) + twk.Mul(hint, mem) + s.div_i(mem) s.norm() a.cmove(s, 1-qr) hint.cmove(twk, 1-qr) F.a.copy(a) - F.a.Sqrt(hint) + F.a.Sqrt(hint, mem) s.copy(a) - s.Invert(hint) - s.Mul(F.a) + s.Invert(hint, mem) + s.Mul(F.a, mem) F.b.copy(s) - F.b.Mul(b) + F.b.Mul(b, mem) t.copy(F.a) 
F.a.cmove(F.b, 1-qr) F.b.cmove(t, 1-qr) - sgn := F.sign() - nr := NewFP8copy(F) - nr.Neg() + sgn := F.sign(mem) + nr := NewFP8copy(F, mem) + nr.Neg(mem) nr.norm() F.cmove(nr, sgn) } diff --git a/nekryptology/pkg/core/curves/native/bls48581/g1.go b/nekryptology/pkg/core/curves/native/bls48581/g1.go index 99227dc..9959b11 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/g1.go +++ b/nekryptology/pkg/core/curves/native/bls48581/g1.go @@ -19,6 +19,8 @@ package bls48581 +import "arena" + //import "fmt" /* Elliptic Curve Point Structure */ @@ -29,54 +31,59 @@ type ECP struct { } /* Constructors */ -func NewECP() *ECP { - E := new(ECP) - E.x = NewFP() - E.y = NewFPint(1) - if CURVETYPE == EDWARDS { - E.z = NewFPint(1) +func NewECP(mem *arena.Arena) *ECP { + var E *ECP + if mem != nil { + E = arena.New[ECP](mem) } else { - E.z = NewFP() + E = new(ECP) } + E.x = NewFP(mem) + E.y = NewFPint(1, mem) + E.z = NewFP(mem) return E } /* set (x,y) from two BIGs */ -func NewECPbigs(ix *BIG, iy *BIG) *ECP { - E := new(ECP) - E.x = NewFPbig(ix) - E.y = NewFPbig(iy) - E.z = NewFPint(1) - E.x.norm() - rhs := RHS(E.x) - - if CURVETYPE == MONTGOMERY { - if rhs.qr(nil) != 1 { - E.inf() - } +func NewECPbigs(ix *BIG, iy *BIG, mem *arena.Arena) *ECP { + var E *ECP + if mem != nil { + E = arena.New[ECP](mem) } else { - y2 := NewFPcopy(E.y) - y2.Sqr() - if !y2.Equals(rhs) { - E.inf() - } + E = new(ECP) + } + E.x = NewFPbig(ix, mem) + E.y = NewFPbig(iy, mem) + E.z = NewFPint(1, mem) + E.x.norm() + rhs := RHS(E.x, mem) + + y2 := NewFPcopy(E.y, mem) + y2.Sqr(mem) + if !y2.Equals(rhs) { + E.inf() } return E } /* set (x,y) from BIG and a bit */ -func NewECPbigint(ix *BIG, s int) *ECP { - E := new(ECP) - E.x = NewFPbig(ix) - E.y = NewFP() +func NewECPbigint(ix *BIG, s int, mem *arena.Arena) *ECP { + var E *ECP + if mem != nil { + E = arena.New[ECP](mem) + } else { + E = new(ECP) + } + E.x = NewFPbig(ix, mem) + E.y = NewFP(mem) E.x.norm() - rhs := RHS(E.x) - E.z = NewFPint(1) - hint := NewFP() 
+ rhs := RHS(E.x, mem) + E.z = NewFPint(1, mem) + hint := NewFP(mem) if rhs.qr(hint) == 1 { - ny := rhs.Sqrt(hint) - if ny.sign() != s { - ny.Neg() + ny := rhs.Sqrt(hint, mem) + if ny.sign(mem) != s { + ny.Neg(mem) ny.norm() } E.y.copy(ny) @@ -87,18 +94,21 @@ func NewECPbigint(ix *BIG, s int) *ECP { } /* set from x - calculate y from curve equation */ -func NewECPbig(ix *BIG) *ECP { - E := new(ECP) - E.x = NewFPbig(ix) - E.y = NewFP() +func NewECPbig(ix *BIG, mem *arena.Arena) *ECP { + var E *ECP + if mem != nil { + E = arena.New[ECP](mem) + } else { + E = new(ECP) + } + E.x = NewFPbig(ix, mem) + E.y = NewFP(mem) E.x.norm() - rhs := RHS(E.x) - E.z = NewFPint(1) - hint := NewFP() + rhs := RHS(E.x, mem) + E.z = NewFPint(1, mem) + hint := NewFP(mem) if rhs.qr(hint) == 1 { - if CURVETYPE != MONTGOMERY { - E.y.copy(rhs.Sqrt(hint)) - } + E.y.copy(rhs.Sqrt(hint, mem)) } else { E.inf() } @@ -106,36 +116,23 @@ func NewECPbig(ix *BIG) *ECP { } /* test for O point-at-infinity */ -func (E *ECP) Is_infinity() bool { +func (E *ECP) Is_infinity(mem *arena.Arena) bool { // if E.INF {return true} - if CURVETYPE == EDWARDS { - return (E.x.IsZero() && E.y.Equals(E.z)) - } - if CURVETYPE == WEIERSTRASS { - return (E.x.IsZero() && E.z.IsZero()) - } - if CURVETYPE == MONTGOMERY { - return E.z.IsZero() - } - return true + return (E.x.IsZero(mem) && E.z.IsZero(mem)) } /* Conditional swap of P and Q dependant on d */ func (E *ECP) cswap(Q *ECP, d int) { E.x.cswap(Q.x, d) - if CURVETYPE != MONTGOMERY { - E.y.cswap(Q.y, d) - } + E.y.cswap(Q.y, d) E.z.cswap(Q.z, d) } /* Conditional move of Q to P dependant on d */ func (E *ECP) cmove(Q *ECP, d int) { E.x.cmove(Q.x, d) - if CURVETYPE != MONTGOMERY { - E.y.cmove(Q.y, d) - } + E.y.cmove(Q.y, d) E.z.cmove(Q.z, d) } @@ -149,28 +146,20 @@ func teq(b int32, c int32) int { /* this=P */ func (E *ECP) Copy(P *ECP) { E.x.copy(P.x) - if CURVETYPE != MONTGOMERY { - E.y.copy(P.y) - } + E.y.copy(P.y) E.z.copy(P.z) } /* this=-this */ -func (E *ECP) Neg() { - 
if CURVETYPE == WEIERSTRASS { - E.y.Neg() - E.y.norm() - } - if CURVETYPE == EDWARDS { - E.x.Neg() - E.x.norm() - } +func (E *ECP) Neg(mem *arena.Arena) { + E.y.Neg(mem) + E.y.norm() return } /* Constant time select from pre-computed table */ func (E *ECP) selector(W []*ECP, b int32) { - MP := NewECP() + MP := NewECP(nil) m := b >> 31 babs := (b ^ m) - m @@ -186,137 +175,106 @@ func (E *ECP) selector(W []*ECP, b int32) { E.cmove(W[7], teq(babs, 7)) MP.Copy(E) - MP.Neg() + MP.Neg(nil) E.cmove(MP, int(m&1)) } /* set this=O */ func (E *ECP) inf() { E.x.zero() - if CURVETYPE != MONTGOMERY { - E.y.one() - } - if CURVETYPE != EDWARDS { - E.z.zero() - } else { - E.z.one() - } + E.y.one() + E.z.zero() } /* Test P == Q */ func (E *ECP) Equals(Q *ECP) bool { - a := NewFP() - b := NewFP() + mem := arena.NewArena() + defer mem.Free() + a := NewFP(mem) + b := NewFP(mem) a.copy(E.x) - a.Mul(Q.z) - a.reduce() + a.Mul(Q.z, mem) + a.reduce(mem) b.copy(Q.x) - b.Mul(E.z) - b.reduce() + b.Mul(E.z, mem) + b.reduce(mem) if !a.Equals(b) { return false } - if CURVETYPE != MONTGOMERY { - a.copy(E.y) - a.Mul(Q.z) - a.reduce() - b.copy(Q.y) - b.Mul(E.z) - b.reduce() - if !a.Equals(b) { - return false - } + a.copy(E.y) + a.Mul(Q.z, mem) + a.reduce(mem) + b.copy(Q.y) + b.Mul(E.z, mem) + b.reduce(mem) + if !a.Equals(b) { + return false } return true } /* Calculate RHS of curve equation */ -func RHS(x *FP) *FP { - r := NewFPcopy(x) - r.Sqr() +func RHS(x *FP, mem *arena.Arena) *FP { + r := NewFPcopy(x, mem) + r.Sqr(mem) - if CURVETYPE == WEIERSTRASS { // x^3+Ax+B - b := NewFPbig(NewBIGints(CURVE_B)) - r.Mul(x) - if CURVE_A == -3 { - cx := NewFPcopy(x) - cx.imul(3) - cx.Neg() - cx.norm() - r.Add(cx) - } - r.Add(b) + // x^3+Ax+B + b := NewFPbig(NewBIGints(CURVE_B, mem), mem) + r.Mul(x, mem) + if CURVE_A == -3 { + cx := NewFPcopy(x, mem) + cx.imul(3, mem) + cx.Neg(mem) + cx.norm() + r.Add(cx, mem) } - if CURVETYPE == EDWARDS { // (Ax^2-1)/(Bx^2-1) - b := NewFPbig(NewBIGints(CURVE_B)) + r.Add(b, mem) 
- one := NewFPint(1) - b.Mul(r) - b.Sub(one) - b.norm() - if CURVE_A == -1 { - r.Neg() - } - r.Sub(one) - r.norm() - b.Invert(nil) - r.Mul(b) - } - if CURVETYPE == MONTGOMERY { // x^3+Ax^2+x - x3 := NewFP() - x3.copy(r) - x3.Mul(x) - r.imul(CURVE_A) - r.Add(x3) - r.Add(x) - } - r.reduce() + r.reduce(mem) return r } /* set to affine - from (x,y,z) to (x,y) */ -func (E *ECP) Affine() { - if E.Is_infinity() { +func (E *ECP) Affine(mem *arena.Arena) { + if E.Is_infinity(mem) { return } - one := NewFPint(1) + one := NewFPint(1, mem) if E.z.Equals(one) { return } - E.z.Invert(nil) - E.x.Mul(E.z) - E.x.reduce() + E.z.Invert(nil, mem) + E.x.Mul(E.z, mem) + E.x.reduce(mem) - if CURVETYPE != MONTGOMERY { - E.y.Mul(E.z) - E.y.reduce() - } + E.y.Mul(E.z, mem) + E.y.reduce(mem) E.z.copy(one) } /* extract x as a BIG */ -func (E *ECP) GetX() *BIG { - W := NewECP() +func (E *ECP) GetX(mem *arena.Arena) *BIG { + W := NewECP(mem) W.Copy(E) - W.Affine() - return W.x.Redc() + W.Affine(mem) + return W.x.Redc(mem) } /* extract y as a BIG */ -func (E *ECP) GetY() *BIG { - W := NewECP() +func (E *ECP) GetY(mem *arena.Arena) *BIG { + W := NewECP(mem) W.Copy(E) - W.Affine() - return W.y.Redc() + W.Affine(mem) + return W.y.Redc(mem) } /* get sign of Y */ -func (E *ECP) GetS() int { - W := NewECP() +func (E *ECP) GetS(mem *arena.Arena) int { + W := NewECP(mem) W.Copy(E) - W.Affine() - return W.y.sign() + W.Affine(mem) + return W.y.sign(mem) } /* extract x as an FP */ @@ -338,55 +296,25 @@ func (E *ECP) getz() *FP { func (E *ECP) ToBytes(b []byte, compress bool) { var t [int(MODBYTES)]byte MB := int(MODBYTES) - alt := false - W := NewECP() + W := NewECP(nil) W.Copy(E) - W.Affine() - W.x.Redc().ToBytes(t[:]) + W.Affine(nil) + W.x.Redc(nil).ToBytes(t[:]) - if CURVETYPE == MONTGOMERY { - for i := 0; i < MB; i++ { - b[i] = t[i] + for i := 0; i < MB; i++ { + b[i+1] = t[i] + } + if compress { + b[0] = 0x02 + if W.y.sign(nil) == 1 { + b[0] = 0x03 } - //b[0] = 0x06 return } - - if (MODBITS-1)%8 <= 4 
&& ALLOW_ALT_COMPRESS { - alt = true - } - - if alt { - for i := 0; i < MB; i++ { - b[i] = t[i] - } - if compress { - b[0] |= 0x80 - if W.y.islarger() == 1 { - b[0] |= 0x20 - } - } else { - W.y.Redc().ToBytes(t[:]) - for i := 0; i < MB; i++ { - b[i+MB] = t[i] - } - } - } else { - for i := 0; i < MB; i++ { - b[i+1] = t[i] - } - if compress { - b[0] = 0x02 - if W.y.sign() == 1 { - b[0] = 0x03 - } - return - } - b[0] = 0x04 - W.y.Redc().ToBytes(t[:]) - for i := 0; i < MB; i++ { - b[i+MB+1] = t[i] - } + b[0] = 0x04 + W.y.Redc(nil).ToBytes(t[:]) + for i := 0; i < MB; i++ { + b[i+MB+1] = t[i] } } @@ -394,616 +322,194 @@ func (E *ECP) ToBytes(b []byte, compress bool) { func ECP_fromBytes(b []byte) *ECP { var t [int(MODBYTES)]byte MB := int(MODBYTES) - p := NewBIGints(Modulus) - alt := false + p := NewBIGints(Modulus, nil) - if CURVETYPE == MONTGOMERY { - for i := 0; i < MB; i++ { - t[i] = b[i] - } - px := FromBytes(t[:]) - if Comp(px, p) >= 0 { - return NewECP() - } - return NewECPbig(px) + for i := 0; i < MB; i++ { + t[i] = b[i+1] + } + px := FromBytes(t[:]) + if Comp(px, p) >= 0 { + return NewECP(nil) } - if (MODBITS-1)%8 <= 4 && ALLOW_ALT_COMPRESS { - alt = true + if b[0] == 0x04 { + for i := 0; i < MB; i++ { + t[i] = b[i+MB+1] + } + py := FromBytes(t[:]) + if Comp(py, p) >= 0 { + return NewECP(nil) + } + return NewECPbigs(px, py, nil) } - if alt { - for i := 0; i < MB; i++ { - t[i] = b[i] - } - t[0] &= 0x1f - px := FromBytes(t[:]) - if (b[0] & 0x80) == 0 { - for i := 0; i < MB; i++ { - t[i] = b[i+MB] - } - py := FromBytes(t[:]) - return NewECPbigs(px, py) - } else { - sgn := (b[0] & 0x20) >> 5 - P := NewECPbigint(px, 0) - cmp := P.y.islarger() - if (sgn == 1 && cmp != 1) || (sgn == 0 && cmp == 1) { - P.Neg() - } - return P - } - } else { - for i := 0; i < MB; i++ { - t[i] = b[i+1] - } - px := FromBytes(t[:]) - if Comp(px, p) >= 0 { - return NewECP() - } - - if b[0] == 0x04 { - for i := 0; i < MB; i++ { - t[i] = b[i+MB+1] - } - py := FromBytes(t[:]) - if Comp(py, p) >= 
0 { - return NewECP() - } - return NewECPbigs(px, py) - } - - if b[0] == 0x02 || b[0] == 0x03 { - return NewECPbigint(px, int(b[0]&1)) - } + if b[0] == 0x02 || b[0] == 0x03 { + return NewECPbigint(px, int(b[0]&1), nil) } - return NewECP() + return NewECP(nil) } /* convert to hex string */ func (E *ECP) ToString() string { - W := NewECP() + W := NewECP(nil) W.Copy(E) - W.Affine() - if W.Is_infinity() { + W.Affine(nil) + if W.Is_infinity(nil) { return "infinity" } - if CURVETYPE == MONTGOMERY { - return "(" + W.x.Redc().ToString() + ")" - } else { - return "(" + W.x.Redc().ToString() + "," + W.y.Redc().ToString() + ")" - } + return "(" + W.x.Redc(nil).ToString() + "," + W.y.Redc(nil).ToString() + ")" } /* this*=2 */ -func (E *ECP) Dbl() { +func (E *ECP) Dbl(mem *arena.Arena) { + t0 := NewFPcopy(E.y, mem) + t0.Sqr(mem) + t1 := NewFPcopy(E.y, mem) + t1.Mul(E.z, mem) + t2 := NewFPcopy(E.z, mem) + t2.Sqr(mem) - if CURVETYPE == WEIERSTRASS { - if CURVE_A == 0 { - t0 := NewFPcopy(E.y) - t0.Sqr() - t1 := NewFPcopy(E.y) - t1.Mul(E.z) - t2 := NewFPcopy(E.z) - t2.Sqr() + E.z.copy(t0) + E.z.Add(t0, mem) + E.z.norm() + E.z.Add(E.z, mem) + E.z.Add(E.z, mem) + E.z.norm() + t2.imul(3*CURVE_B_I, mem) - E.z.copy(t0) - E.z.Add(t0) - E.z.norm() - E.z.Add(E.z) - E.z.Add(E.z) - E.z.norm() - t2.imul(3 * CURVE_B_I) + x3 := NewFPcopy(t2, mem) + x3.Mul(E.z, mem) - x3 := NewFPcopy(t2) - x3.Mul(E.z) + y3 := NewFPcopy(t0, mem) + y3.Add(t2, mem) + y3.norm() + E.z.Mul(t1, mem) + t1.copy(t2) + t1.Add(t2, mem) + t2.Add(t1, mem) + t0.Sub(t2, mem) + t0.norm() + y3.Mul(t0, mem) + y3.Add(x3, mem) + t1.copy(E.x) + t1.Mul(E.y, mem) + E.x.copy(t0) + E.x.norm() + E.x.Mul(t1, mem) + E.x.Add(E.x, mem) + E.x.norm() + E.y.copy(y3) + E.y.norm() - y3 := NewFPcopy(t0) - y3.Add(t2) - y3.norm() - E.z.Mul(t1) - t1.copy(t2) - t1.Add(t2) - t2.Add(t1) - t0.Sub(t2) - t0.norm() - y3.Mul(t0) - y3.Add(x3) - t1.copy(E.x) - t1.Mul(E.y) - E.x.copy(t0) - E.x.norm() - E.x.Mul(t1) - E.x.Add(E.x) - E.x.norm() - E.y.copy(y3) - 
E.y.norm() - } else { - t0 := NewFPcopy(E.x) - t1 := NewFPcopy(E.y) - t2 := NewFPcopy(E.z) - t3 := NewFPcopy(E.x) - z3 := NewFPcopy(E.z) - y3 := NewFP() - x3 := NewFP() - b := NewFP() - - if CURVE_B_I == 0 { - b.copy(NewFPbig(NewBIGints(CURVE_B))) - } - - t0.Sqr() //1 x^2 - t1.Sqr() //2 y^2 - t2.Sqr() //3 - - t3.Mul(E.y) //4 - t3.Add(t3) - t3.norm() //5 - z3.Mul(E.x) //6 - z3.Add(z3) - z3.norm() //7 - y3.copy(t2) - - if CURVE_B_I == 0 { - y3.Mul(b) - } else { - y3.imul(CURVE_B_I) - } - - y3.Sub(z3) //9 *** - x3.copy(y3) - x3.Add(y3) - x3.norm() //10 - - y3.Add(x3) //11 - x3.copy(t1) - x3.Sub(y3) - x3.norm() //12 - y3.Add(t1) - y3.norm() //13 - y3.Mul(x3) //14 - x3.Mul(t3) //15 - t3.copy(t2) - t3.Add(t2) //16 - t2.Add(t3) //17 - - if CURVE_B_I == 0 { - z3.Mul(b) - } else { - z3.imul(CURVE_B_I) - } - - z3.Sub(t2) //19 - z3.Sub(t0) - z3.norm() //20 *** - t3.copy(z3) - t3.Add(z3) //21 - - z3.Add(t3) - z3.norm() //22 - t3.copy(t0) - t3.Add(t0) //23 - t0.Add(t3) //24 - t0.Sub(t2) - t0.norm() //25 - - t0.Mul(z3) //26 - y3.Add(t0) //27 - t0.copy(E.y) - t0.Mul(E.z) //28 - t0.Add(t0) - t0.norm() //29 - z3.Mul(t0) //30 - x3.Sub(z3) //x3.norm();//31 - t0.Add(t0) - t0.norm() //32 - t1.Add(t1) - t1.norm() //33 - z3.copy(t0) - z3.Mul(t1) //34 - - E.x.copy(x3) - E.x.norm() - E.y.copy(y3) - E.y.norm() - E.z.copy(z3) - E.z.norm() - } - } - - if CURVETYPE == EDWARDS { - C := NewFPcopy(E.x) - D := NewFPcopy(E.y) - H := NewFPcopy(E.z) - J := NewFP() - - E.x.Mul(E.y) - E.x.Add(E.x) - E.x.norm() - C.Sqr() - D.Sqr() - if CURVE_A == -1 { - C.Neg() - } - E.y.copy(C) - E.y.Add(D) - E.y.norm() - - H.Sqr() - H.Add(H) - E.z.copy(E.y) - J.copy(E.y) - J.Sub(H) - J.norm() - E.x.Mul(J) - C.Sub(D) - C.norm() - E.y.Mul(C) - E.z.Mul(J) - - } - if CURVETYPE == MONTGOMERY { - A := NewFPcopy(E.x) - B := NewFPcopy(E.x) - AA := NewFP() - BB := NewFP() - C := NewFP() - - A.Add(E.z) - A.norm() - AA.copy(A) - AA.Sqr() - B.Sub(E.z) - B.norm() - BB.copy(B) - BB.Sqr() - C.copy(AA) - C.Sub(BB) - C.norm() - - 
E.x.copy(AA) - E.x.Mul(BB) - - A.copy(C) - A.imul((CURVE_A + 2) / 4) - - BB.Add(A) - BB.norm() - E.z.copy(BB) - E.z.Mul(C) - } return } /* this+=Q */ -func (E *ECP) Add(Q *ECP) { +func (E *ECP) Add(Q *ECP, mem *arena.Arena) { + b := 3 * CURVE_B_I + t0 := NewFPcopy(E.x, mem) + t0.Mul(Q.x, mem) + t1 := NewFPcopy(E.y, mem) + t1.Mul(Q.y, mem) + t2 := NewFPcopy(E.z, mem) + t2.Mul(Q.z, mem) + t3 := NewFPcopy(E.x, mem) + t3.Add(E.y, mem) + t3.norm() + t4 := NewFPcopy(Q.x, mem) + t4.Add(Q.y, mem) + t4.norm() + t3.Mul(t4, mem) + t4.copy(t0) + t4.Add(t1, mem) - if CURVETYPE == WEIERSTRASS { - if CURVE_A == 0 { - b := 3 * CURVE_B_I - t0 := NewFPcopy(E.x) - t0.Mul(Q.x) - t1 := NewFPcopy(E.y) - t1.Mul(Q.y) - t2 := NewFPcopy(E.z) - t2.Mul(Q.z) - t3 := NewFPcopy(E.x) - t3.Add(E.y) - t3.norm() - t4 := NewFPcopy(Q.x) - t4.Add(Q.y) - t4.norm() - t3.Mul(t4) - t4.copy(t0) - t4.Add(t1) + t3.Sub(t4, mem) + t3.norm() + t4.copy(E.y) + t4.Add(E.z, mem) + t4.norm() + x3 := NewFPcopy(Q.y, mem) + x3.Add(Q.z, mem) + x3.norm() - t3.Sub(t4) - t3.norm() - t4.copy(E.y) - t4.Add(E.z) - t4.norm() - x3 := NewFPcopy(Q.y) - x3.Add(Q.z) - x3.norm() + t4.Mul(x3, mem) + x3.copy(t1) + x3.Add(t2, mem) - t4.Mul(x3) - x3.copy(t1) - x3.Add(t2) + t4.Sub(x3, mem) + t4.norm() + x3.copy(E.x) + x3.Add(E.z, mem) + x3.norm() + y3 := NewFPcopy(Q.x, mem) + y3.Add(Q.z, mem) + y3.norm() + x3.Mul(y3, mem) + y3.copy(t0) + y3.Add(t2, mem) + y3.rsub(x3, mem) + y3.norm() + x3.copy(t0) + x3.Add(t0, mem) + t0.Add(x3, mem) + t0.norm() + t2.imul(b, mem) - t4.Sub(x3) - t4.norm() - x3.copy(E.x) - x3.Add(E.z) - x3.norm() - y3 := NewFPcopy(Q.x) - y3.Add(Q.z) - y3.norm() - x3.Mul(y3) - y3.copy(t0) - y3.Add(t2) - y3.rsub(x3) - y3.norm() - x3.copy(t0) - x3.Add(t0) - t0.Add(x3) - t0.norm() - t2.imul(b) + z3 := NewFPcopy(t1, mem) + z3.Add(t2, mem) + z3.norm() + t1.Sub(t2, mem) + t1.norm() + y3.imul(b, mem) - z3 := NewFPcopy(t1) - z3.Add(t2) - z3.norm() - t1.Sub(t2) - t1.norm() - y3.imul(b) + x3.copy(y3) + x3.Mul(t4, mem) + t2.copy(t3) + 
t2.Mul(t1, mem) + x3.rsub(t2, mem) + y3.Mul(t0, mem) + t1.Mul(z3, mem) + y3.Add(t1, mem) + t0.Mul(t3, mem) + z3.Mul(t4, mem) + z3.Add(t0, mem) - x3.copy(y3) - x3.Mul(t4) - t2.copy(t3) - t2.Mul(t1) - x3.rsub(t2) - y3.Mul(t0) - t1.Mul(z3) - y3.Add(t1) - t0.Mul(t3) - z3.Mul(t4) - z3.Add(t0) + E.x.copy(x3) + E.x.norm() + E.y.copy(y3) + E.y.norm() + E.z.copy(z3) + E.z.norm() - E.x.copy(x3) - E.x.norm() - E.y.copy(y3) - E.y.norm() - E.z.copy(z3) - E.z.norm() - } else { - - t0 := NewFPcopy(E.x) - t1 := NewFPcopy(E.y) - t2 := NewFPcopy(E.z) - t3 := NewFPcopy(E.x) - t4 := NewFPcopy(Q.x) - z3 := NewFP() - y3 := NewFPcopy(Q.x) - x3 := NewFPcopy(Q.y) - b := NewFP() - - if CURVE_B_I == 0 { - b.copy(NewFPbig(NewBIGints(CURVE_B))) - } - - t0.Mul(Q.x) //1 - t1.Mul(Q.y) //2 - t2.Mul(Q.z) //3 - - t3.Add(E.y) - t3.norm() //4 - t4.Add(Q.y) - t4.norm() //5 - t3.Mul(t4) //6 - t4.copy(t0) - t4.Add(t1) //7 - t3.Sub(t4) - t3.norm() //8 - t4.copy(E.y) - t4.Add(E.z) - t4.norm() //9 - x3.Add(Q.z) - x3.norm() //10 - t4.Mul(x3) //11 - x3.copy(t1) - x3.Add(t2) //12 - - t4.Sub(x3) - t4.norm() //13 - x3.copy(E.x) - x3.Add(E.z) - x3.norm() //14 - y3.Add(Q.z) - y3.norm() //15 - - x3.Mul(y3) //16 - y3.copy(t0) - y3.Add(t2) //17 - - y3.rsub(x3) - y3.norm() //18 - z3.copy(t2) - - if CURVE_B_I == 0 { - z3.Mul(b) - } else { - z3.imul(CURVE_B_I) - } - - x3.copy(y3) - x3.Sub(z3) - x3.norm() //20 - z3.copy(x3) - z3.Add(x3) //21 - - x3.Add(z3) //22 - z3.copy(t1) - z3.Sub(x3) - z3.norm() //23 - x3.Add(t1) - x3.norm() //24 - - if CURVE_B_I == 0 { - y3.Mul(b) - } else { - y3.imul(CURVE_B_I) - } - - t1.copy(t2) - t1.Add(t2) //26 - t2.Add(t1) //27 - - y3.Sub(t2) //28 - - y3.Sub(t0) - y3.norm() //29 - t1.copy(y3) - t1.Add(y3) //30 - y3.Add(t1) - y3.norm() //31 - - t1.copy(t0) - t1.Add(t0) //32 - t0.Add(t1) //33 - t0.Sub(t2) - t0.norm() //34 - t1.copy(t4) - t1.Mul(y3) //35 - t2.copy(t0) - t2.Mul(y3) //36 - y3.copy(x3) - y3.Mul(z3) //37 - y3.Add(t2) //38 - x3.Mul(t3) //39 - x3.Sub(t1) //40 - z3.Mul(t4) //41 - 
t1.copy(t3) - t1.Mul(t0) //42 - z3.Add(t1) - E.x.copy(x3) - E.x.norm() - E.y.copy(y3) - E.y.norm() - E.z.copy(z3) - E.z.norm() - - } - } - if CURVETYPE == EDWARDS { - b := NewFPbig(NewBIGints(CURVE_B)) - A := NewFPcopy(E.z) - B := NewFP() - C := NewFPcopy(E.x) - D := NewFPcopy(E.y) - EE := NewFP() - F := NewFP() - G := NewFP() - - A.Mul(Q.z) - B.copy(A) - B.Sqr() - C.Mul(Q.x) - D.Mul(Q.y) - - EE.copy(C) - EE.Mul(D) - EE.Mul(b) - F.copy(B) - F.Sub(EE) - G.copy(B) - G.Add(EE) - - if CURVE_A == 1 { - EE.copy(D) - EE.Sub(C) - } - C.Add(D) - - B.copy(E.x) - B.Add(E.y) - D.copy(Q.x) - D.Add(Q.y) - B.norm() - D.norm() - B.Mul(D) - B.Sub(C) - B.norm() - F.norm() - B.Mul(F) - E.x.copy(A) - E.x.Mul(B) - G.norm() - if CURVE_A == 1 { - EE.norm() - C.copy(EE) - C.Mul(G) - } - if CURVE_A == -1 { - C.norm() - C.Mul(G) - } - E.y.copy(A) - E.y.Mul(C) - E.z.copy(F) - E.z.Mul(G) - } return } -/* Differential Add for Montgomery curves. this+=Q where W is this-Q and is affine. */ -func (E *ECP) dAdd(Q *ECP, W *ECP) { - A := NewFPcopy(E.x) - B := NewFPcopy(E.x) - C := NewFPcopy(Q.x) - D := NewFPcopy(Q.x) - DA := NewFP() - CB := NewFP() - - A.Add(E.z) - B.Sub(E.z) - - C.Add(Q.z) - D.Sub(Q.z) - A.norm() - D.norm() - - DA.copy(D) - DA.Mul(A) - C.norm() - B.norm() - - CB.copy(C) - CB.Mul(B) - - A.copy(DA) - A.Add(CB) - A.norm() - A.Sqr() - B.copy(DA) - B.Sub(CB) - B.norm() - B.Sqr() - - E.x.copy(A) - E.z.copy(W.x) - E.z.Mul(B) - -} - /* this-=Q */ -func (E *ECP) Sub(Q *ECP) { - NQ := NewECP() +func (E *ECP) Sub(Q *ECP, mem *arena.Arena) { + NQ := NewECP(mem) NQ.Copy(Q) - NQ.Neg() - E.Add(NQ) + NQ.Neg(mem) + E.Add(NQ, mem) } /* constant time multiply by small integer of length bts - use lAdder */ -func (E *ECP) pinmul(e int32, bts int32) *ECP { - if CURVETYPE == MONTGOMERY { - return E.lmul(NewBIGint(int(e))) - } else { - P := NewECP() - R0 := NewECP() - R1 := NewECP() - R1.Copy(E) +func (E *ECP) pinmul(e int32, bts int32, mem *arena.Arena) *ECP { + P := NewECP(mem) + R0 := NewECP(mem) + R1 
:= NewECP(mem) + R1.Copy(E) - for i := bts - 1; i >= 0; i-- { - b := int((e >> uint32(i)) & 1) - P.Copy(R1) - P.Add(R0) - R0.cswap(R1, b) - R1.Copy(P) - R0.Dbl() - R0.cswap(R1, b) - } - P.Copy(R0) - return P + for i := bts - 1; i >= 0; i-- { + b := int((e >> uint32(i)) & 1) + P.Copy(R1) + P.Add(R0, mem) + R0.cswap(R1, b) + R1.Copy(P) + R0.Dbl(mem) + R0.cswap(R1, b) } + P.Copy(R0) + return P } // Point multiplication, multiplies a point P by a scalar e @@ -1016,120 +522,97 @@ func (E *ECP) pinmul(e int32, bts int32) *ECP { // The point multiplication methods used will process leading zeros correctly. // So this function leaks information about the length of e... -func (E *ECP) lmul(e *BIG) *ECP { - return E.clmul(e, e) +func (E *ECP) lmul(e *BIG, outer, mem *arena.Arena) *ECP { + return E.clmul(e, e, outer, mem) } // .. but this one does not (typically set maxe=r) // Set P=e*P /* return e.this */ -func (E *ECP) clmul(e *BIG, maxe *BIG) *ECP { - if e.IsZero() || E.Is_infinity() { - return NewECP() +func (E *ECP) clmul(e *BIG, maxe *BIG, outer, mem *arena.Arena) *ECP { + if e.IsZero() || E.Is_infinity(mem) { + return NewECP(outer) } - P := NewECP() - cm := NewBIGcopy(e) + P := NewECP(outer) + cm := NewBIGcopy(e, mem) cm.or(maxe) max := cm.nbits() - if CURVETYPE == MONTGOMERY { - /* use LAdder */ - D := NewECP() - R0 := NewECP() - R0.Copy(E) - R1 := NewECP() - R1.Copy(E) - R1.Dbl() - D.Copy(E) - D.Affine() - nb := max - for i := nb - 2; i >= 0; i-- { - b := int(e.bit(i)) - P.Copy(R1) - P.dAdd(R0, D) - R0.cswap(R1, b) - R1.Copy(P) - R0.Dbl() - R0.cswap(R1, b) - } - P.Copy(R0) - } else { - // fixed size windows - mt := NewBIG() - t := NewBIG() - Q := NewECP() - C := NewECP() + // fixed size windows + mt := NewBIG(mem) + t := NewBIG(mem) + Q := NewECP(mem) + C := NewECP(mem) - var W []*ECP - var w [1 + (NLEN*int(BASEBITS)+3)/4]int8 + var W []*ECP + var w [1 + (NLEN*int(BASEBITS)+3)/4]int8 - Q.Copy(E) - Q.Dbl() + Q.Copy(E) + Q.Dbl(mem) - W = append(W, NewECP()) - 
W[0].Copy(E) + W = append(W, NewECP(mem)) + W[0].Copy(E) - for i := 1; i < 8; i++ { - W = append(W, NewECP()) - W[i].Copy(W[i-1]) - W[i].Add(Q) - } - - // make exponent odd - Add 2P if even, P if odd - t.copy(e) - s := int(t.parity()) - t.inc(1) - t.norm() - ns := int(t.parity()) - mt.copy(t) - mt.inc(1) - mt.norm() - t.cmove(mt, s) - Q.cmove(E, ns) - C.Copy(Q) - - nb := 1 + (max+3)/4 - - // convert exponent to signed 4-bit window - for i := 0; i < nb; i++ { - w[i] = int8(t.lastbits(5) - 16) - t.dec(int(w[i])) - t.norm() - t.fshr(4) - } - w[nb] = int8(t.lastbits(5)) - - //P.Copy(W[(int(w[nb])-1)/2]) - P.selector(W, int32(w[nb])) - for i := nb - 1; i >= 0; i-- { - Q.selector(W, int32(w[i])) - P.Dbl() - P.Dbl() - P.Dbl() - P.Dbl() - P.Add(Q) - } - P.Sub(C) /* apply correction */ + for i := 1; i < 8; i++ { + W = append(W, NewECP(mem)) + W[i].Copy(W[i-1]) + W[i].Add(Q, mem) } + + // make exponent odd - Add 2P if even, P if odd + t.copy(e) + s := int(t.parity()) + t.inc(1) + t.norm() + ns := int(t.parity()) + mt.copy(t) + mt.inc(1) + mt.norm() + t.cmove(mt, s) + Q.cmove(E, ns) + C.Copy(Q) + + nb := 1 + (max+3)/4 + + // convert exponent to signed 4-bit window + for i := 0; i < nb; i++ { + w[i] = int8(t.lastbits(5) - 16) + t.dec(int(w[i])) + t.norm() + t.fshr(4) + } + w[nb] = int8(t.lastbits(5)) + + //P.Copy(W[(int(w[nb])-1)/2]) + P.selector(W, int32(w[nb])) + for i := nb - 1; i >= 0; i-- { + Q.selector(W, int32(w[i])) + P.Dbl(mem) + P.Dbl(mem) + P.Dbl(mem) + P.Dbl(mem) + P.Add(Q, mem) + } + P.Sub(C, mem) /* apply correction */ return P } /* Public version */ -func (E *ECP) Mul(e *BIG) *ECP { - return E.lmul(e) +func (E *ECP) Mul(e *BIG, outer, mem *arena.Arena) *ECP { + return E.lmul(e, outer, mem) } // Generic multi-multiplication, fixed 4-bit window, P=Sigma e_i*X_i -func ECP_muln(n int, X []*ECP, e []*BIG) *ECP { - P := NewECP() - R := NewECP() - S := NewECP() +func ECP_muln(n int, X []*ECP, e []*BIG, mem *arena.Arena) *ECP { + P := NewECP(nil) + R := NewECP(mem) + S 
:= NewECP(mem) var B []*ECP - t := NewBIG() + t := NewBIG(mem) for i := 0; i < 16; i++ { - B = append(B, NewECP()) + B = append(B, NewECP(mem)) } - mt := NewBIGcopy(e[0]) + mt := NewBIGcopy(e[0], mem) mt.norm() for i := 1; i < n; i++ { // find biggest t.copy(e[i]) @@ -1142,36 +625,42 @@ func ECP_muln(n int, X []*ECP, e []*BIG) *ECP { for j := 0; j < 16; j++ { B[j].inf() } + + inner := arena.NewArena() for j := 0; j < n; j++ { mt.copy(e[j]) mt.norm() mt.shr(uint(i * 4)) k := mt.lastbits(4) - B[k].Add(X[j]) + B[k].Add(X[j], inner) + if j%32 == 0 || j == n-1 { + inner.Free() + inner = arena.NewArena() + } } R.inf() S.inf() for j := 15; j >= 1; j-- { - R.Add(B[j]) - S.Add(R) + R.Add(B[j], mem) + S.Add(R, mem) } for j := 0; j < 4; j++ { - P.Dbl() + P.Dbl(mem) } - P.Add(S) + P.Add(S, mem) } return P } /* Return e.this+f.Q */ -func (E *ECP) Mul2(e *BIG, Q *ECP, f *BIG) *ECP { - te := NewBIG() - tf := NewBIG() - mt := NewBIG() - S := NewECP() - T := NewECP() - C := NewECP() +func (E *ECP) Mul2(e *BIG, Q *ECP, f *BIG, mem *arena.Arena) *ECP { + te := NewBIG(mem) + tf := NewBIG(mem) + mt := NewBIG(mem) + S := NewECP(mem) + T := NewECP(mem) + C := NewECP(mem) var W []*ECP var w [1 + (NLEN*int(BASEBITS)+1)/2]int8 @@ -1180,28 +669,28 @@ func (E *ECP) Mul2(e *BIG, Q *ECP, f *BIG) *ECP { // precompute table for i := 0; i < 8; i++ { - W = append(W, NewECP()) + W = append(W, NewECP(mem)) } W[1].Copy(E) - W[1].Sub(Q) + W[1].Sub(Q, mem) W[2].Copy(E) - W[2].Add(Q) + W[2].Add(Q, mem) S.Copy(Q) - S.Dbl() + S.Dbl(mem) W[0].Copy(W[1]) - W[0].Sub(S) + W[0].Sub(S, mem) W[3].Copy(W[2]) - W[3].Add(S) + W[3].Add(S, mem) T.Copy(E) - T.Dbl() + T.Dbl(mem) W[5].Copy(W[1]) - W[5].Add(T) + W[5].Add(T, mem) W[6].Copy(W[2]) - W[6].Add(T) + W[6].Add(T, mem) W[4].Copy(W[5]) - W[4].Sub(S) + W[4].Sub(S, mem) W[7].Copy(W[6]) - W[7].Add(S) + W[7].Add(S, mem) // if multiplier is odd, Add 2, else Add 1 to multiplier, and Add 2P or P to correction @@ -1225,7 +714,7 @@ func (E *ECP) Mul2(e *BIG, Q *ECP, f *BIG) 
*ECP { mt.norm() tf.cmove(mt, s) S.cmove(Q, ns) - C.Add(S) + C.Add(S, mem) mt.copy(te) mt.Add(tf) @@ -1249,48 +738,31 @@ func (E *ECP) Mul2(e *BIG, Q *ECP, f *BIG) *ECP { S.selector(W, int32(w[nb])) for i := nb - 1; i >= 0; i-- { T.selector(W, int32(w[i])) - S.Dbl() - S.Dbl() - S.Add(T) + S.Dbl(mem) + S.Dbl(mem) + S.Add(T, mem) } - S.Sub(C) /* apply correction */ + S.Sub(C, mem) /* apply correction */ return S } func (E *ECP) Cfp() { - cf := CURVE_Cof_I - if cf == 1 { - return - } - if cf == 4 { - E.Dbl() - E.Dbl() - return - } - if cf == 8 { - E.Dbl() - E.Dbl() - E.Dbl() - return - } - c := NewBIGints(CURVE_Cof) - E.Copy(E.lmul(c)) + mem := arena.NewArena() + defer mem.Free() + c := NewBIGints(CURVE_Cof, mem) + E.Copy(E.lmul(c, nil, mem)) } /* Hunt and Peck a BIG to a curve point */ -func ECP_hap2point(h *BIG) *ECP { +func ECP_hap2point(h *BIG, mem *arena.Arena) *ECP { var P *ECP - x := NewBIGcopy(h) + x := NewBIGcopy(h, mem) for true { - if CURVETYPE != MONTGOMERY { - P = NewECPbigint(x, 0) - } else { - P = NewECPbig(x) - } + P = NewECPbigint(x, 0, mem) x.inc(1) x.norm() - if !P.Is_infinity() { + if !P.Is_infinity(mem) { break } } @@ -1299,539 +771,102 @@ func ECP_hap2point(h *BIG) *ECP { /* Constant time Map to Point */ func ECP_map2point(h *FP) *ECP { - P := NewECP() + P := NewECP(nil) - if CURVETYPE == MONTGOMERY { - // Elligator 2 - X1 := NewFP() - X2 := NewFP() - w := NewFP() - one := NewFPint(1) - A := NewFPint(CURVE_A) - t := NewFPcopy(h) - N := NewFP() - D := NewFP() - hint := NewFP() + // swu method + A := NewFP(nil) + B := NewFP(nil) + X1 := NewFP(nil) + X2 := NewFP(nil) + X3 := NewFP(nil) + one := NewFPint(1, nil) + Y := NewFP(nil) + D := NewFP(nil) + t := NewFPcopy(h, nil) + w := NewFP(nil) + //Y3:=NewFP() + sgn := t.sign(nil) - t.Sqr() + // Shallue and van de Woestijne + // SQRTm3 not available, so preprocess this out + /* */ + Z := RIADZ + X1.copy(NewFPint(Z, nil)) + X3.copy(X1) + A.copy(RHS(X1, nil)) + B.copy(NewFPbig(NewBIGints(SQRTm3, nil), nil)) 
+ B.imul(Z, nil) - if PM1D2 == 2 { - t.Add(t) - } - if PM1D2 == 1 { - t.Neg() - } - if PM1D2 > 2 { - t.imul(QNRI) - } + t.Sqr(nil) + Y.copy(A) + Y.Mul(t, nil) + t.copy(one) + t.Add(Y, nil) + t.norm() + Y.rsub(one, nil) + Y.norm() + D.copy(t) + D.Mul(Y, nil) + D.Mul(B, nil) - t.norm() - D.copy(t) - D.Add(one) - D.norm() + w.copy(A) + FP_tpo(D, w) - X1.copy(A) - X1.Neg() - X1.norm() - X2.copy(X1) - X2.Mul(t) - - w.copy(X1) - w.Sqr() - N.copy(w) - N.Mul(X1) - w.Mul(A) - w.Mul(D) - N.Add(w) - t.copy(D) - t.Sqr() - t.Mul(X1) - N.Add(t) - N.norm() - - t.copy(N) - t.Mul(D) - qres := t.qr(hint) - w.copy(t) - w.Invert(hint) - D.copy(w) - D.Mul(N) - X1.Mul(D) - X2.Mul(D) - X1.cmove(X2, 1-qres) - - a := X1.Redc() - P.Copy(NewECPbig(a)) - } - if CURVETYPE == EDWARDS { - // Elligator 2 - map to Montgomery, place point, map back - X1 := NewFP() - X2 := NewFP() - t := NewFPcopy(h) - w := NewFP() - one := NewFPint(1) - A := NewFP() - w1 := NewFP() - w2 := NewFP() - B := NewFPbig(NewBIGints(CURVE_B)) - Y := NewFP() - K := NewFP() - D := NewFP() - hint := NewFP() - //Y3:=NewFP() - rfc := 0 - - if MODTYPE != GENERALISED_MERSENNE { - A.copy(B) - - if CURVE_A == 1 { - A.Add(one) - B.Sub(one) - } else { - A.Sub(one) - B.Add(one) - } - A.norm() - B.norm() - - A.div2() - B.div2() - B.div2() - - K.copy(B) - K.Neg() - K.norm() - //K.Invert(nil) - K.invsqrt(K, w1) - - rfc = RIADZ - if rfc == 1 { // RFC7748 - A.Mul(K) - K.Mul(w1) - //K=K.Sqrt(nil) - } else { - B.Sqr() - } - } else { - rfc = 1 - A.copy(NewFPint(156326)) - } - - t.Sqr() - qnr := 0 - if PM1D2 == 2 { - t.Add(t) - qnr = 2 - } - if PM1D2 == 1 { - t.Neg() - qnr = -1 - } - if PM1D2 > 2 { - t.imul(QNRI) - qnr = QNRI - } - t.norm() - - D.copy(t) - D.Add(one) - D.norm() - X1.copy(A) - X1.Neg() - X1.norm() - X2.copy(X1) - X2.Mul(t) - - // Figure out RHS of Montgomery curve in rational form gx1/d^3 - - w.copy(X1) - w.Sqr() - w1.copy(w) - w1.Mul(X1) - w.Mul(A) - w.Mul(D) - w1.Add(w) - w2.copy(D) - w2.Sqr() - - if rfc == 0 { - w.copy(X1) - 
w.Mul(B) - w2.Mul(w) - w1.Add(w2) - } else { - w2.Mul(X1) - w1.Add(w2) - } - w1.norm() - - B.copy(w1) - B.Mul(D) - qres := B.qr(hint) - w.copy(B) - w.Invert(hint) - D.copy(w) - D.Mul(w1) - X1.Mul(D) - X2.Mul(D) - D.Sqr() - - w1.copy(B) - w1.imul(qnr) - w.copy(NewFPbig(NewBIGints(CURVE_HTPC))) - w.Mul(hint) - w2.copy(D) - w2.Mul(h) - - X1.cmove(X2, 1-qres) - B.cmove(w1, 1-qres) - hint.cmove(w, 1-qres) - D.cmove(w2, 1-qres) - - Y.copy(B.Sqrt(hint)) - Y.Mul(D) - - /* - Y.copy(B.Sqrt(hint)) - Y.Mul(D) - - B.imul(qnr) - w.copy(NewFPbig(NewBIGints(CURVE_HTPC))) - hint.Mul(w) - - Y3.copy(B.Sqrt(hint)) - D.Mul(h) - Y3.Mul(D) - - X1.cmove(X2,1-qres) - Y.cmove(Y3,1-qres) - */ - w.copy(Y) - w.Neg() + w.Mul(B, nil) + if w.sign(nil) == 1 { + w.Neg(nil) w.norm() - Y.cmove(w, qres^Y.sign()) - - if rfc == 0 { - X1.Mul(K) - Y.Mul(K) - } - - if MODTYPE == GENERALISED_MERSENNE { - t.copy(X1) - t.Sqr() - w.copy(t) - w.Add(one) - w.norm() - t.Sub(one) - t.norm() - w1.copy(t) - w1.Mul(Y) - w1.Add(w1) - X2.copy(w1) - X2.Add(w1) - X2.norm() - t.Sqr() - Y.Sqr() - Y.Add(Y) - Y.Add(Y) - Y.norm() - B.copy(t) - B.Add(Y) - B.norm() - - w2.copy(Y) - w2.Sub(t) - w2.norm() - w2.Mul(X1) - t.Mul(X1) - Y.div2() - w1.copy(Y) - w1.Mul(w) - w1.rsub(t) - w1.norm() - - t.copy(X2) - t.Mul(w1) - P.x.copy(t) - t.copy(w2) - t.Mul(B) - P.y.copy(t) - t.copy(w1) - t.Mul(B) - P.z.copy(t) - - return P - } else { - w1.copy(X1) - w1.Add(one) - w1.norm() - w2.copy(X1) - w2.Sub(one) - w2.norm() - t.copy(w1) - t.Mul(Y) - X1.Mul(w1) - - if rfc == 1 { - X1.Mul(K) - } - Y.Mul(w2) - P.x.copy(X1) - P.y.copy(Y) - P.z.copy(t) - - return P - } } - if CURVETYPE == WEIERSTRASS { - // swu method - A := NewFP() - B := NewFP() - X1 := NewFP() - X2 := NewFP() - X3 := NewFP() - one := NewFPint(1) - Y := NewFP() - D := NewFP() - t := NewFPcopy(h) - w := NewFP() - D2 := NewFP() - hint := NewFP() - GX1 := NewFP() - //Y3:=NewFP() - sgn := t.sign() - if CURVE_A != 0 || HTC_ISO != 0 { - if HTC_ISO != 0 { - /* CAHCZS - 
A.copy(NewFPbig(NewBIGints(CURVE_Ad))) - B.copy(NewFPbig(NewBIGints(CURVE_Bd))) - CAHCZF */ - } else { - A.copy(NewFPint(CURVE_A)) - B.copy(NewFPbig(NewBIGints(CURVE_B))) - } - // SSWU method - t.Sqr() - t.imul(RIADZ) - w.copy(t) - w.Add(one) - w.norm() + w.Mul(B, nil) + w.Mul(h, nil) + w.Mul(Y, nil) + w.Mul(D, nil) - w.Mul(t) - D.copy(A) - D.Mul(w) + X1.Neg(nil) + X1.norm() + X1.div2(nil) + X2.copy(X1) + X1.Sub(w, nil) + X1.norm() + X2.Add(w, nil) + X2.norm() + A.Add(A, nil) + A.Add(A, nil) + A.norm() + t.Sqr(nil) + t.Mul(D, nil) + t.Sqr(nil) + A.Mul(t, nil) + X3.Add(A, nil) + X3.norm() - w.Add(one) - w.norm() - w.Mul(B) - w.Neg() - w.norm() + rhs := RHS(X2, nil) + X3.cmove(X2, rhs.qr(nil)) + rhs.copy(RHS(X1, nil)) + X3.cmove(X1, rhs.qr(nil)) + rhs.copy(RHS(X3, nil)) + Y.copy(rhs.Sqrt(nil, nil)) - X2.copy(w) - X3.copy(t) - X3.Mul(X2) + ne := Y.sign(nil) ^ sgn + w.copy(Y) + w.Neg(nil) + w.norm() + Y.cmove(w, ne) - // x^3+Ad^2x+Bd^3 - GX1.copy(X2) - GX1.Sqr() - D2.copy(D) - D2.Sqr() - w.copy(A) - w.Mul(D2) - GX1.Add(w) - GX1.norm() - GX1.Mul(X2) - D2.Mul(D) - w.copy(B) - w.Mul(D2) - GX1.Add(w) - GX1.norm() - - w.copy(GX1) - w.Mul(D) - qr := w.qr(hint) - D.copy(w) - D.Invert(hint) - D.Mul(GX1) - X2.Mul(D) - X3.Mul(D) - t.Mul(h) - D2.copy(D) - D2.Sqr() - - D.copy(D2) - D.Mul(t) - t.copy(w) - t.imul(RIADZ) - X1.copy(NewFPbig(NewBIGints(CURVE_HTPC))) - X1.Mul(hint) - - X2.cmove(X3, 1-qr) - D2.cmove(D, 1-qr) - w.cmove(t, 1-qr) - hint.cmove(X1, 1-qr) - - Y.copy(w.Sqrt(hint)) - Y.Mul(D2) - /* - Y.copy(w.Sqrt(hint)) - Y.Mul(D2) - - D2.Mul(t) - w.imul(RIADZ) - - X1.copy(NewFPbig(NewBIGints(CURVE_HTPC))) - hint.Mul(X1) - - Y3.copy(w.Sqrt(hint)) - Y3.Mul(D2) - - X2.cmove(X3,1-qr) - Y.cmove(Y3,1-qr) - */ - ne := Y.sign() ^ sgn - w.copy(Y) - w.Neg() - w.norm() - Y.cmove(w, ne) - - if HTC_ISO != 0 { - /* CAHCZS - k:=0 - isox:=HTC_ISO - isoy:=3*(isox-1)/2 - - //xnum - xnum:=NewFPbig(NewBIGints(PC[k])); k+=1 - for i:=0;i> 31 babs := (b ^ m) - m @@ -88,25 +99,26 @@ func (E *ECP8) 
selector(W []*ECP8, b int32) { E.cmove(W[7], teq(babs, 7)) MP.Copy(E) - MP.Neg() + MP.Neg(nil) E.cmove(MP, int(m&1)) } /* Test if P == Q */ func (E *ECP8) Equals(Q *ECP8) bool { - - a := NewFP8copy(E.x) - b := NewFP8copy(Q.x) - a.Mul(Q.z) - b.Mul(E.z) + mem := arena.NewArena() + defer mem.Free() + a := NewFP8copy(E.x, mem) + b := NewFP8copy(Q.x, mem) + a.Mul(Q.z, mem) + b.Mul(E.z, mem) if !a.Equals(b) { return false } a.copy(E.y) b.copy(Q.y) - a.Mul(Q.z) - b.Mul(E.z) + a.Mul(Q.z, mem) + b.Mul(E.z, mem) if !a.Equals(b) { return false } @@ -115,38 +127,38 @@ func (E *ECP8) Equals(Q *ECP8) bool { } /* set to Affine - (x,y,z) to (x,y) */ -func (E *ECP8) Affine() { - if E.Is_infinity() { +func (E *ECP8) Affine(mem *arena.Arena) { + if E.Is_infinity(mem) { return } - one := NewFP8int(1) + one := NewFP8int(1, mem) if E.z.Equals(one) { - E.x.reduce() - E.y.reduce() + E.x.reduce(mem) + E.y.reduce(mem) return } - E.z.Invert(nil) + E.z.Invert(nil, mem) - E.x.Mul(E.z) - E.x.reduce() - E.y.Mul(E.z) - E.y.reduce() + E.x.Mul(E.z, mem) + E.x.reduce(mem) + E.y.Mul(E.z, mem) + E.y.reduce(mem) E.z.copy(one) } /* extract affine x as FP2 */ -func (E *ECP8) GetX() *FP8 { - W := NewECP8() +func (E *ECP8) GetX(mem *arena.Arena) *FP8 { + W := NewECP8(mem) W.Copy(E) - W.Affine() + W.Affine(mem) return W.x } /* extract affine y as FP2 */ -func (E *ECP8) GetY() *FP8 { - W := NewECP8() +func (E *ECP8) GetY(mem *arena.Arena) *FP8 { + W := NewECP8(mem) W.Copy(E) - W.Affine() + W.Affine(mem) return W.y } @@ -169,47 +181,24 @@ func (E *ECP8) getz() *FP8 { func (E *ECP8) ToBytes(b []byte, compress bool) { var t [8 * int(MODBYTES)]byte MB := 8 * int(MODBYTES) - alt := false - W := NewECP8() + W := NewECP8(nil) W.Copy(E) - W.Affine() + W.Affine(nil) W.x.ToBytes(t[:]) - if (MODBITS-1)%8 <= 4 && ALLOW_ALT_COMPRESS { - alt = true + for i := 0; i < MB; i++ { + b[i+1] = t[i] } - - if alt { + if !compress { + b[0] = 0x04 + W.y.ToBytes(t[:]) for i := 0; i < MB; i++ { - b[i] = t[i] + b[i+MB+1] = t[i] } - if 
!compress { - W.y.ToBytes(t[:]) - for i := 0; i < MB; i++ { - b[i+MB] = t[i] - } - } else { - b[0] |= 0x80 - if W.y.islarger() == 1 { - b[0] |= 0x20 - } - } - } else { - for i := 0; i < MB; i++ { - b[i+1] = t[i] - } - if !compress { - b[0] = 0x04 - W.y.ToBytes(t[:]) - for i := 0; i < MB; i++ { - b[i+MB+1] = t[i] - } - } else { - b[0] = 0x02 - if W.y.sign() == 1 { - b[0] = 0x03 - } + b[0] = 0x02 + if W.y.sign(nil) == 1 { + b[0] = 0x03 } } } @@ -219,92 +208,64 @@ func ECP8_fromBytes(b []byte) *ECP8 { var t [8 * int(MODBYTES)]byte MB := 8 * int(MODBYTES) typ := int(b[0]) - alt := false - if (MODBITS-1)%8 <= 4 && ALLOW_ALT_COMPRESS { - alt = true + for i := 0; i < MB; i++ { + t[i] = b[i+1] } - - if alt { + rx := FP8_fromBytes(t[:]) + if typ == 0x04 { for i := 0; i < MB; i++ { - t[i] = b[i] - } - t[0] &= 0x1f - rx := FP8_fromBytes(t[:]) - if (b[0] & 0x80) == 0 { - for i := 0; i < MB; i++ { - t[i] = b[i+MB] - } - ry := FP8_fromBytes(t[:]) - return NewECP8fp8s(rx, ry) - } else { - sgn := (b[0] & 0x20) >> 5 - P := NewECP8fp8(rx, 0) - cmp := P.y.islarger() - if (sgn == 1 && cmp != 1) || (sgn == 0 && cmp == 1) { - P.Neg() - } - return P + t[i] = b[i+MB+1] } + ry := FP8_fromBytes(t[:]) + return NewECP8fp8s(rx, ry, nil) } else { - for i := 0; i < MB; i++ { - t[i] = b[i+1] - } - rx := FP8_fromBytes(t[:]) - if typ == 0x04 { - for i := 0; i < MB; i++ { - t[i] = b[i+MB+1] - } - ry := FP8_fromBytes(t[:]) - return NewECP8fp8s(rx, ry) - } else { - return NewECP8fp8(rx, typ&1) - } + return NewECP8fp8(rx, typ&1, nil) } } /* convert this to hex string */ func (E *ECP8) ToString() string { - W := NewECP8() + W := NewECP8(nil) W.Copy(E) - W.Affine() - if W.Is_infinity() { + W.Affine(nil) + if W.Is_infinity(nil) { return "infinity" } return "(" + W.x.toString() + "," + W.y.toString() + ")" } /* Calculate RHS of twisted curve equation x^3+B/i */ -func RHS8(x *FP8) *FP8 { - r := NewFP8copy(x) - r.Sqr() - b2 := NewFP2big(NewBIGints(CURVE_B)) - b4 := NewFP4fp2(b2) - b := NewFP8fp4(b4) +func 
RHS8(x *FP8, mem *arena.Arena) *FP8 { + r := NewFP8copy(x, mem) + r.Sqr(mem) + b2 := NewFP2big(NewBIGints(CURVE_B, mem), mem) + b4 := NewFP4fp2(b2, mem) + b := NewFP8fp4(b4, mem) - if SEXTIC_TWIST == D_TYPE { - b.div_i() - } - if SEXTIC_TWIST == M_TYPE { - b.times_i() - } - r.Mul(x) - r.Add(b) + b.div_i(mem) + r.Mul(x, mem) + r.Add(b, mem) - r.reduce() + r.reduce(mem) return r } /* construct this from (x,y) - but set to O if not on curve */ -func NewECP8fp8s(ix *FP8, iy *FP8) *ECP8 { - E := new(ECP8) - E.x = NewFP8copy(ix) - E.y = NewFP8copy(iy) - E.z = NewFP8int(1) +func NewECP8fp8s(ix *FP8, iy *FP8, mem *arena.Arena) *ECP8 { + var E *ECP8 + if mem != nil { + E = arena.New[ECP8](mem) + } else { + E = new(ECP8) + } + E.x = NewFP8copy(ix, mem) + E.y = NewFP8copy(iy, mem) + E.z = NewFP8int(1, mem) E.x.norm() - rhs := RHS8(E.x) - y2 := NewFP8copy(E.y) - y2.Sqr() + rhs := RHS8(E.x, mem) + y2 := NewFP8copy(E.y, mem) + y2.Sqr(mem) if !y2.Equals(rhs) { E.inf() } @@ -312,20 +273,25 @@ func NewECP8fp8s(ix *FP8, iy *FP8) *ECP8 { } /* construct this from x - but set to O if not on curve */ -func NewECP8fp8(ix *FP8, s int) *ECP8 { - E := new(ECP8) - h := NewFP() - E.x = NewFP8copy(ix) - E.y = NewFP8int(1) - E.z = NewFP8int(1) +func NewECP8fp8(ix *FP8, s int, mem *arena.Arena) *ECP8 { + var E *ECP8 + if mem != nil { + E = arena.New[ECP8](mem) + } else { + E = new(ECP8) + } + h := NewFP(mem) + E.x = NewFP8copy(ix, mem) + E.y = NewFP8int(1, mem) + E.z = NewFP8int(1, mem) E.x.norm() - rhs := RHS8(E.x) + rhs := RHS8(E.x, mem) if rhs.qr(h) == 1 { - rhs.Sqrt(h) - if rhs.sign() != s { - rhs.Neg() + rhs.Sqrt(h, mem) + if rhs.sign(mem) != s { + rhs.Neg(mem) } - rhs.reduce() + rhs.reduce(mem) E.y.copy(rhs) } else { @@ -335,55 +301,48 @@ func NewECP8fp8(ix *FP8, s int) *ECP8 { } /* this+=this */ -func (E *ECP8) Dbl() int { - iy := NewFP8copy(E.y) - if SEXTIC_TWIST == D_TYPE { - iy.times_i() - } +func (E *ECP8) Dbl(mem *arena.Arena) int { + iy := NewFP8copy(E.y, mem) + iy.times_i(mem) - t0 
:= NewFP8copy(E.y) - t0.Sqr() - if SEXTIC_TWIST == D_TYPE { - t0.times_i() - } - t1 := NewFP8copy(iy) - t1.Mul(E.z) - t2 := NewFP8copy(E.z) - t2.Sqr() + t0 := NewFP8copy(E.y, mem) + t0.Sqr(mem) + t0.times_i(mem) + t1 := NewFP8copy(iy, mem) + t1.Mul(E.z, mem) + t2 := NewFP8copy(E.z, mem) + t2.Sqr(mem) E.z.copy(t0) - E.z.Add(t0) + E.z.Add(t0, mem) E.z.norm() - E.z.Add(E.z) - E.z.Add(E.z) + E.z.Add(E.z, mem) + E.z.Add(E.z, mem) E.z.norm() - t2.imul(3 * CURVE_B_I) - if SEXTIC_TWIST == M_TYPE { - t2.times_i() - } - x3 := NewFP8copy(t2) - x3.Mul(E.z) + t2.imul(3*CURVE_B_I, mem) + x3 := NewFP8copy(t2, mem) + x3.Mul(E.z, mem) - y3 := NewFP8copy(t0) + y3 := NewFP8copy(t0, mem) - y3.Add(t2) + y3.Add(t2, mem) y3.norm() - E.z.Mul(t1) + E.z.Mul(t1, mem) t1.copy(t2) - t1.Add(t2) - t2.Add(t1) + t1.Add(t2, mem) + t2.Add(t1, mem) t2.norm() - t0.Sub(t2) + t0.Sub(t2, mem) t0.norm() //y^2-9bz^2 - y3.Mul(t0) - y3.Add(x3) //(y^2+3z*2)(y^2-9z^2)+3b.z^2.8y^2 + y3.Mul(t0, mem) + y3.Add(x3, mem) //(y^2+3z*2)(y^2-9z^2)+3b.z^2.8y^2 t1.copy(E.x) - t1.Mul(iy) // + t1.Mul(iy, mem) // E.x.copy(t0) E.x.norm() - E.x.Mul(t1) - E.x.Add(E.x) //(y^2-9bz^2)xy2 + E.x.Mul(t1, mem) + E.x.Add(E.x, mem) //(y^2-9bz^2)xy2 E.x.norm() E.y.copy(y3) @@ -393,90 +352,78 @@ func (E *ECP8) Dbl() int { } /* this+=Q - return 0 for Add, 1 for double, -1 for O */ -func (E *ECP8) Add(Q *ECP8) int { +func (E *ECP8) Add(Q *ECP8, mem *arena.Arena) int { b := 3 * CURVE_B_I - t0 := NewFP8copy(E.x) - t0.Mul(Q.x) // x.Q.x - t1 := NewFP8copy(E.y) - t1.Mul(Q.y) // y.Q.y + t0 := NewFP8copy(E.x, mem) + t0.Mul(Q.x, mem) // x.Q.x + t1 := NewFP8copy(E.y, mem) + t1.Mul(Q.y, mem) // y.Q.y - t2 := NewFP8copy(E.z) - t2.Mul(Q.z) - t3 := NewFP8copy(E.x) - t3.Add(E.y) + t2 := NewFP8copy(E.z, mem) + t2.Mul(Q.z, mem) + t3 := NewFP8copy(E.x, mem) + t3.Add(E.y, mem) t3.norm() //t3=X1+Y1 - t4 := NewFP8copy(Q.x) - t4.Add(Q.y) - t4.norm() //t4=X2+Y2 - t3.Mul(t4) //t3=(X1+Y1)(X2+Y2) + t4 := NewFP8copy(Q.x, mem) + t4.Add(Q.y, mem) + t4.norm() 
//t4=X2+Y2 + t3.Mul(t4, mem) //t3=(X1+Y1)(X2+Y2) t4.copy(t0) - t4.Add(t1) //t4=X1.X2+Y1.Y2 + t4.Add(t1, mem) //t4=X1.X2+Y1.Y2 - t3.Sub(t4) + t3.Sub(t4, mem) t3.norm() - if SEXTIC_TWIST == D_TYPE { - t3.times_i() //t3=(X1+Y1)(X2+Y2)-(X1.X2+Y1.Y2) = X1.Y2+X2.Y1 - } + t3.times_i(mem) //t3=(X1+Y1)(X2+Y2)-(X1.X2+Y1.Y2) = X1.Y2+X2.Y1 t4.copy(E.y) - t4.Add(E.z) + t4.Add(E.z, mem) t4.norm() //t4=Y1+Z1 - x3 := NewFP8copy(Q.y) - x3.Add(Q.z) + x3 := NewFP8copy(Q.y, mem) + x3.Add(Q.z, mem) x3.norm() //x3=Y2+Z2 - t4.Mul(x3) //t4=(Y1+Z1)(Y2+Z2) - x3.copy(t1) // - x3.Add(t2) //X3=Y1.Y2+Z1.Z2 + t4.Mul(x3, mem) //t4=(Y1+Z1)(Y2+Z2) + x3.copy(t1) // + x3.Add(t2, mem) //X3=Y1.Y2+Z1.Z2 - t4.Sub(x3) + t4.Sub(x3, mem) t4.norm() - if SEXTIC_TWIST == D_TYPE { - t4.times_i() //t4=(Y1+Z1)(Y2+Z2) - (Y1.Y2+Z1.Z2) = Y1.Z2+Y2.Z1 - } + t4.times_i(mem) //t4=(Y1+Z1)(Y2+Z2) - (Y1.Y2+Z1.Z2) = Y1.Z2+Y2.Z1 x3.copy(E.x) - x3.Add(E.z) + x3.Add(E.z, mem) x3.norm() // x3=X1+Z1 - y3 := NewFP8copy(Q.x) - y3.Add(Q.z) - y3.norm() // y3=X2+Z2 - x3.Mul(y3) // x3=(X1+Z1)(X2+Z2) + y3 := NewFP8copy(Q.x, mem) + y3.Add(Q.z, mem) + y3.norm() // y3=X2+Z2 + x3.Mul(y3, mem) // x3=(X1+Z1)(X2+Z2) y3.copy(t0) - y3.Add(t2) // y3=X1.X2+Z1+Z2 - y3.rsub(x3) + y3.Add(t2, mem) // y3=X1.X2+Z1+Z2 + y3.rsub(x3, mem) y3.norm() // y3=(X1+Z1)(X2+Z2) - (X1.X2+Z1.Z2) = X1.Z2+X2.Z1 - if SEXTIC_TWIST == D_TYPE { - t0.times_i() // x.Q.x - t1.times_i() // y.Q.y - } + t0.times_i(mem) // x.Q.x + t1.times_i(mem) // y.Q.y x3.copy(t0) - x3.Add(t0) - t0.Add(x3) + x3.Add(t0, mem) + t0.Add(x3, mem) t0.norm() - t2.imul(b) - if SEXTIC_TWIST == M_TYPE { - t2.times_i() - } - z3 := NewFP8copy(t1) - z3.Add(t2) + t2.imul(b, mem) + z3 := NewFP8copy(t1, mem) + z3.Add(t2, mem) z3.norm() - t1.Sub(t2) + t1.Sub(t2, mem) t1.norm() - y3.imul(b) - if SEXTIC_TWIST == M_TYPE { - y3.times_i() - } + y3.imul(b, mem) x3.copy(y3) - x3.Mul(t4) + x3.Mul(t4, mem) t2.copy(t3) - t2.Mul(t1) - x3.rsub(t2) - y3.Mul(t0) - t1.Mul(z3) - y3.Add(t1) - t0.Mul(t3) - z3.Mul(t4) - 
z3.Add(t0) + t2.Mul(t1, mem) + x3.rsub(t2, mem) + y3.Mul(t0, mem) + t1.Mul(z3, mem) + y3.Add(t1, mem) + t0.Mul(t3, mem) + z3.Mul(t4, mem) + z3.Add(t0, mem) E.x.copy(x3) E.x.norm() @@ -489,51 +436,42 @@ func (E *ECP8) Add(Q *ECP8) int { } /* set this-=Q */ -func (E *ECP8) Sub(Q *ECP8) int { - NQ := NewECP8() +func (E *ECP8) Sub(Q *ECP8, mem *arena.Arena) int { + NQ := NewECP8(mem) NQ.Copy(Q) - NQ.Neg() - D := E.Add(NQ) + NQ.Neg(mem) + D := E.Add(NQ, mem) return D } func ECP8_frob_constants() [3]*FP2 { - Fra := NewBIGints(Fra) - Frb := NewBIGints(Frb) - X := NewFP2bigs(Fra, Frb) + Fra := NewBIGints(Fra, nil) + Frb := NewBIGints(Frb, nil) + X := NewFP2bigs(Fra, Frb, nil) - F0 := NewFP2copy(X) - F0.Sqr() - F2 := NewFP2copy(F0) - F2.Mul_ip() + F0 := NewFP2copy(X, nil) + F0.Sqr(nil) + F2 := NewFP2copy(F0, nil) + F2.Mul_ip(nil) F2.norm() - F1 := NewFP2copy(F2) - F1.Sqr() - F2.Mul(F1) + F1 := NewFP2copy(F2, nil) + F1.Sqr(nil) + F2.Mul(F1, nil) - F2.Mul_ip() + F2.Mul_ip(nil) F2.norm() F1.copy(X) - if SEXTIC_TWIST == M_TYPE { - F1.Mul_ip() - F1.norm() - F1.Invert(nil) - F0.copy(F1) - F0.Sqr() - F1.Mul(F0) - } - if SEXTIC_TWIST == D_TYPE { - F0.copy(F1) - F0.Sqr() - F1.Mul(F0) - F0.Mul_ip() - F0.norm() - F1.Mul_ip() - F1.norm() - F1.Mul_ip() - F1.norm() - } + + F0.copy(F1) + F0.Sqr(nil) + F1.Mul(F0, nil) + F0.Mul_ip(nil) + F0.norm() + F1.Mul_ip(nil) + F1.norm() + F1.Mul_ip(nil) + F1.norm() F := [3]*FP2{F0, F1, F2} return F @@ -542,41 +480,27 @@ func ECP8_frob_constants() [3]*FP2 { /* set this*=q, where q is Modulus, using Frobenius */ func (E *ECP8) frob(F [3]*FP2, n int) { for i := 0; i < n; i++ { - E.x.frob(F[2]) - if SEXTIC_TWIST == M_TYPE { - E.x.qmul(F[0]) - E.x.times_i2() - } - if SEXTIC_TWIST == D_TYPE { - E.x.qmul(F[0]) - E.x.times_i2() - } - E.y.frob(F[2]) - if SEXTIC_TWIST == M_TYPE { - E.y.qmul(F[1]) - E.y.times_i2() - E.y.times_i() - } - if SEXTIC_TWIST == D_TYPE { - E.y.qmul(F[1]) - E.y.times_i() - } - - E.z.frob(F[2]) + E.x.frob(F[2], nil) + E.x.qmul(F[0], nil) 
+ E.x.times_i2(nil) + E.y.frob(F[2], nil) + E.y.qmul(F[1], nil) + E.y.times_i(nil) + E.z.frob(F[2], nil) } } /* P*=e */ -func (E *ECP8) mul(e *BIG) *ECP8 { +func (E *ECP8) mul(e *BIG, mem *arena.Arena) *ECP8 { /* fixed size windows */ - mt := NewBIG() - t := NewBIG() - P := NewECP8() - Q := NewECP8() - C := NewECP8() + mt := NewBIG(mem) + t := NewBIG(mem) + P := NewECP8(nil) + Q := NewECP8(mem) + C := NewECP8(mem) - if E.Is_infinity() { - return NewECP8() + if E.Is_infinity(mem) { + return NewECP8(mem) } var W []*ECP8 @@ -584,15 +508,15 @@ func (E *ECP8) mul(e *BIG) *ECP8 { /* precompute table */ Q.Copy(E) - Q.Dbl() + Q.Dbl(mem) - W = append(W, NewECP8()) + W = append(W, NewECP8(mem)) W[0].Copy(E) for i := 1; i < 8; i++ { - W = append(W, NewECP8()) + W = append(W, NewECP8(mem)) W[i].Copy(W[i-1]) - W[i].Add(Q) + W[i].Add(Q, mem) } /* make exponent odd - Add 2P if even, P if odd */ @@ -622,81 +546,80 @@ func (E *ECP8) mul(e *BIG) *ECP8 { P.selector(W, int32(w[nb])) for i := nb - 1; i >= 0; i-- { Q.selector(W, int32(w[i])) - P.Dbl() - P.Dbl() - P.Dbl() - P.Dbl() - P.Add(Q) + P.Dbl(mem) + P.Dbl(mem) + P.Dbl(mem) + P.Dbl(mem) + P.Add(Q, mem) } - P.Sub(C) - P.Affine() + P.Sub(C, mem) + P.Affine(mem) return P } /* Public version */ -func (E *ECP8) Mul(e *BIG) *ECP8 { - return E.mul(e) +func (E *ECP8) Mul(e *BIG, mem *arena.Arena) *ECP8 { + return E.mul(e, mem) } /* needed for SOK */ func (E *ECP8) Cfp() { F := ECP8_frob_constants() - x := NewBIGints(CURVE_Bnx) + x := NewBIGints(CURVE_Bnx, nil) - xQ := E.Mul(x) - x2Q := xQ.Mul(x) - x3Q := x2Q.Mul(x) - x4Q := x3Q.Mul(x) - x5Q := x4Q.Mul(x) - x6Q := x5Q.Mul(x) - x7Q := x6Q.Mul(x) - x8Q := x7Q.Mul(x) + xQ := E.Mul(x, nil) + x2Q := xQ.Mul(x, nil) + x3Q := x2Q.Mul(x, nil) + x4Q := x3Q.Mul(x, nil) + x5Q := x4Q.Mul(x, nil) + x6Q := x5Q.Mul(x, nil) + x7Q := x6Q.Mul(x, nil) + x8Q := x7Q.Mul(x, nil) - if SIGN_OF_X == NEGATIVEX { - xQ.Neg() - x3Q.Neg() - x5Q.Neg() - x7Q.Neg() - } - x8Q.Sub(x7Q) - x8Q.Sub(E) + xQ.Neg(nil) + 
x3Q.Neg(nil) + x5Q.Neg(nil) + x7Q.Neg(nil) - x7Q.Sub(x6Q) + x8Q.Sub(x7Q, nil) + x8Q.Sub(E, nil) + + x7Q.Sub(x6Q, nil) x7Q.frob(F, 1) - x6Q.Sub(x5Q) + x6Q.Sub(x5Q, nil) x6Q.frob(F, 2) - x5Q.Sub(x4Q) + x5Q.Sub(x4Q, nil) x5Q.frob(F, 3) - x4Q.Sub(x3Q) + x4Q.Sub(x3Q, nil) x4Q.frob(F, 4) - x3Q.Sub(x2Q) + x3Q.Sub(x2Q, nil) x3Q.frob(F, 5) - x2Q.Sub(xQ) + x2Q.Sub(xQ, nil) x2Q.frob(F, 6) - xQ.Sub(E) + xQ.Sub(E, nil) xQ.frob(F, 7) - E.Dbl() + E.Dbl(nil) E.frob(F, 8) - E.Add(x8Q) - E.Add(x7Q) - E.Add(x6Q) - E.Add(x5Q) + E.Add(x8Q, nil) + E.Add(x7Q, nil) + E.Add(x6Q, nil) + E.Add(x5Q, nil) - E.Add(x4Q) - E.Add(x3Q) - E.Add(x2Q) - E.Add(xQ) + E.Add(x4Q, nil) + E.Add(x3Q, nil) + E.Add(x2Q, nil) + E.Add(xQ, nil) - E.Affine() + E.Affine(nil) } func ECP8_generator() *ECP8 { @@ -704,34 +627,34 @@ func ECP8_generator() *ECP8 { G = NewECP8fp8s( NewFP8fp4s( NewFP4fp2s( - NewFP2bigs(NewBIGints(CURVE_Pxaaa), NewBIGints(CURVE_Pxaab)), - NewFP2bigs(NewBIGints(CURVE_Pxaba), NewBIGints(CURVE_Pxabb))), + NewFP2bigs(NewBIGints(CURVE_Pxaaa, nil), NewBIGints(CURVE_Pxaab, nil), nil), + NewFP2bigs(NewBIGints(CURVE_Pxaba, nil), NewBIGints(CURVE_Pxabb, nil), nil), nil), NewFP4fp2s( - NewFP2bigs(NewBIGints(CURVE_Pxbaa), NewBIGints(CURVE_Pxbab)), - NewFP2bigs(NewBIGints(CURVE_Pxbba), NewBIGints(CURVE_Pxbbb)))), + NewFP2bigs(NewBIGints(CURVE_Pxbaa, nil), NewBIGints(CURVE_Pxbab, nil), nil), + NewFP2bigs(NewBIGints(CURVE_Pxbba, nil), NewBIGints(CURVE_Pxbbb, nil), nil), nil), nil), NewFP8fp4s( NewFP4fp2s( - NewFP2bigs(NewBIGints(CURVE_Pyaaa), NewBIGints(CURVE_Pyaab)), - NewFP2bigs(NewBIGints(CURVE_Pyaba), NewBIGints(CURVE_Pyabb))), + NewFP2bigs(NewBIGints(CURVE_Pyaaa, nil), NewBIGints(CURVE_Pyaab, nil), nil), + NewFP2bigs(NewBIGints(CURVE_Pyaba, nil), NewBIGints(CURVE_Pyabb, nil), nil), nil), NewFP4fp2s( - NewFP2bigs(NewBIGints(CURVE_Pybaa), NewBIGints(CURVE_Pybab)), - NewFP2bigs(NewBIGints(CURVE_Pybba), NewBIGints(CURVE_Pybbb))))) + NewFP2bigs(NewBIGints(CURVE_Pybaa, nil), NewBIGints(CURVE_Pybab, nil), 
nil), + NewFP2bigs(NewBIGints(CURVE_Pybba, nil), NewBIGints(CURVE_Pybbb, nil), nil), nil), nil), nil) return G } func ECP8_hap2point(h *BIG) *ECP8 { - one := NewBIGint(1) - x := NewBIGcopy(h) + one := NewBIGint(1, nil) + x := NewBIGcopy(h, nil) var X2 *FP2 var X4 *FP4 var X8 *FP8 var Q *ECP8 for true { - X2 = NewFP2bigs(one, x) - X4 = NewFP4fp2(X2) - X8 = NewFP8fp4(X4) - Q = NewECP8fp8(X8, 0) - if !Q.Is_infinity() { + X2 = NewFP2bigs(one, x, nil) + X4 = NewFP4fp2(X2, nil) + X8 = NewFP8fp4(X4, nil) + Q = NewECP8fp8(X8, 0, nil) + if !Q.Is_infinity(nil) { break } x.inc(1) @@ -743,83 +666,83 @@ func ECP8_hap2point(h *BIG) *ECP8 { /* Deterministic mapping of Fp to point on curve */ func ECP8_map2point(H *FP8) *ECP8 { // Shallue and van de Woestijne - NY := NewFP8int(1) - T := NewFP8copy(H) - sgn := T.sign() + NY := NewFP8int(1, nil) + T := NewFP8copy(H, nil) + sgn := T.sign(nil) - Z := NewFPint(RIADZG2A) - X1 := NewFP8fp(Z) - X3 := NewFP8copy(X1) - A := RHS8(X1) - W := NewFP8copy(A) - W.Sqrt(nil) + Z := NewFPint(RIADZG2A, nil) + X1 := NewFP8fp(Z, nil) + X3 := NewFP8copy(X1, nil) + A := RHS8(X1, nil) + W := NewFP8copy(A, nil) + W.Sqrt(nil, nil) - s := NewFPbig(NewBIGints(SQRTm3)) - Z.Mul(s) + s := NewFPbig(NewBIGints(SQRTm3, nil), nil) + Z.Mul(s, nil) - T.Sqr() - Y := NewFP8copy(A) - Y.Mul(T) + T.Sqr(nil) + Y := NewFP8copy(A, nil) + Y.Mul(T, nil) T.copy(NY) - T.Add(Y) + T.Add(Y, nil) T.norm() - Y.rsub(NY) + Y.rsub(NY, nil) Y.norm() NY.copy(T) - NY.Mul(Y) + NY.Mul(Y, nil) - NY.tmul(Z) - NY.Invert(nil) + NY.tmul(Z, nil) + NY.Invert(nil, nil) - W.tmul(Z) - if W.sign() == 1 { - W.Neg() + W.tmul(Z, nil) + if W.sign(nil) == 1 { + W.Neg(nil) W.norm() } - W.tmul(Z) - W.Mul(H) - W.Mul(Y) - W.Mul(NY) + W.tmul(Z, nil) + W.Mul(H, nil) + W.Mul(Y, nil) + W.Mul(NY, nil) - X1.Neg() + X1.Neg(nil) X1.norm() - X1.div2() - X2 := NewFP8copy(X1) - X1.Sub(W) + X1.div2(nil) + X2 := NewFP8copy(X1, nil) + X1.Sub(W, nil) X1.norm() - X2.Add(W) + X2.Add(W, nil) X2.norm() - A.Add(A) - A.Add(A) + 
A.Add(A, nil) + A.Add(A, nil) A.norm() - T.Sqr() - T.Mul(NY) - T.Sqr() - A.Mul(T) - X3.Add(A) + T.Sqr(nil) + T.Mul(NY, nil) + T.Sqr(nil) + A.Mul(T, nil) + X3.Add(A, nil) X3.norm() - Y.copy(RHS8(X2)) + Y.copy(RHS8(X2, nil)) X3.cmove(X2, Y.qr(nil)) - Y.copy(RHS8(X1)) + Y.copy(RHS8(X1, nil)) X3.cmove(X1, Y.qr(nil)) - Y.copy(RHS8(X3)) - Y.Sqrt(nil) + Y.copy(RHS8(X3, nil)) + Y.Sqrt(nil, nil) - ne := Y.sign() ^ sgn + ne := Y.sign(nil) ^ sgn W.copy(Y) - W.Neg() + W.Neg(nil) W.norm() Y.cmove(W, ne) - return NewECP8fp8s(X3, Y) + return NewECP8fp8s(X3, Y, nil) } /* Map octet string to curve point */ func ECP8_mapit(h []byte) *ECP8 { - q := NewBIGints(Modulus) + q := NewBIGints(Modulus, nil) dx := DBIG_fromBytes(h) - x := dx.Mod(q) + x := dx.Mod(q, nil) Q := ECP8_hap2point(x) Q.Cfp() @@ -830,14 +753,14 @@ func ECP8_mapit(h []byte) *ECP8 { // Bos & Costello https://eprint.iacr.org/2013/458.pdf // Faz-Hernandez & Longa & Sanchez https://eprint.iacr.org/2013/158.pdf // Side channel attack secure -func Mul16(Q []*ECP8, u []*BIG) *ECP8 { - W := NewECP8() - P := NewECP8() +func Mul16(Q []*ECP8, u []*BIG, mem *arena.Arena) *ECP8 { + W := NewECP8(mem) + P := NewECP8(mem) var T1 []*ECP8 var T2 []*ECP8 var T3 []*ECP8 var T4 []*ECP8 - mt := NewBIG() + mt := NewBIG(mem) var t []*BIG var bt int8 var k int @@ -852,104 +775,104 @@ func Mul16(Q []*ECP8, u []*BIG) *ECP8 { var s4 [NLEN*int(BASEBITS) + 1]int8 for i := 0; i < 16; i++ { - t = append(t, NewBIGcopy(u[i])) + t = append(t, NewBIGcopy(u[i], mem)) } - T1 = append(T1, NewECP8()) + T1 = append(T1, NewECP8(mem)) T1[0].Copy(Q[0]) // Q[0] - T1 = append(T1, NewECP8()) + T1 = append(T1, NewECP8(mem)) T1[1].Copy(T1[0]) - T1[1].Add(Q[1]) // Q[0]+Q[1] - T1 = append(T1, NewECP8()) + T1[1].Add(Q[1], mem) // Q[0]+Q[1] + T1 = append(T1, NewECP8(mem)) T1[2].Copy(T1[0]) - T1[2].Add(Q[2]) // Q[0]+Q[2] - T1 = append(T1, NewECP8()) + T1[2].Add(Q[2], mem) // Q[0]+Q[2] + T1 = append(T1, NewECP8(mem)) T1[3].Copy(T1[1]) - T1[3].Add(Q[2]) // Q[0]+Q[1]+Q[2] - 
T1 = append(T1, NewECP8()) + T1[3].Add(Q[2], mem) // Q[0]+Q[1]+Q[2] + T1 = append(T1, NewECP8(mem)) T1[4].Copy(T1[0]) - T1[4].Add(Q[3]) // Q[0]+Q[3] - T1 = append(T1, NewECP8()) + T1[4].Add(Q[3], mem) // Q[0]+Q[3] + T1 = append(T1, NewECP8(mem)) T1[5].Copy(T1[1]) - T1[5].Add(Q[3]) // Q[0]+Q[1]+Q[3] - T1 = append(T1, NewECP8()) + T1[5].Add(Q[3], mem) // Q[0]+Q[1]+Q[3] + T1 = append(T1, NewECP8(mem)) T1[6].Copy(T1[2]) - T1[6].Add(Q[3]) // Q[0]+Q[2]+Q[3] - T1 = append(T1, NewECP8()) + T1[6].Add(Q[3], mem) // Q[0]+Q[2]+Q[3] + T1 = append(T1, NewECP8(mem)) T1[7].Copy(T1[3]) - T1[7].Add(Q[3]) // Q[0]+Q[1]+Q[2]+Q[3] + T1[7].Add(Q[3], mem) // Q[0]+Q[1]+Q[2]+Q[3] - T2 = append(T2, NewECP8()) + T2 = append(T2, NewECP8(mem)) T2[0].Copy(Q[4]) // Q[0] - T2 = append(T2, NewECP8()) + T2 = append(T2, NewECP8(mem)) T2[1].Copy(T2[0]) - T2[1].Add(Q[5]) // Q[0]+Q[1] - T2 = append(T2, NewECP8()) + T2[1].Add(Q[5], mem) // Q[0]+Q[1] + T2 = append(T2, NewECP8(mem)) T2[2].Copy(T2[0]) - T2[2].Add(Q[6]) // Q[0]+Q[2] - T2 = append(T2, NewECP8()) + T2[2].Add(Q[6], mem) // Q[0]+Q[2] + T2 = append(T2, NewECP8(mem)) T2[3].Copy(T2[1]) - T2[3].Add(Q[6]) // Q[0]+Q[1]+Q[2] - T2 = append(T2, NewECP8()) + T2[3].Add(Q[6], mem) // Q[0]+Q[1]+Q[2] + T2 = append(T2, NewECP8(mem)) T2[4].Copy(T2[0]) - T2[4].Add(Q[7]) // Q[0]+Q[3] - T2 = append(T2, NewECP8()) + T2[4].Add(Q[7], mem) // Q[0]+Q[3] + T2 = append(T2, NewECP8(mem)) T2[5].Copy(T2[1]) - T2[5].Add(Q[7]) // Q[0]+Q[1]+Q[3] - T2 = append(T2, NewECP8()) + T2[5].Add(Q[7], mem) // Q[0]+Q[1]+Q[3] + T2 = append(T2, NewECP8(mem)) T2[6].Copy(T2[2]) - T2[6].Add(Q[7]) // Q[0]+Q[2]+Q[3] - T2 = append(T2, NewECP8()) + T2[6].Add(Q[7], mem) // Q[0]+Q[2]+Q[3] + T2 = append(T2, NewECP8(mem)) T2[7].Copy(T2[3]) - T2[7].Add(Q[7]) // Q[0]+Q[1]+Q[2]+Q[3] + T2[7].Add(Q[7], mem) // Q[0]+Q[1]+Q[2]+Q[3] - T3 = append(T3, NewECP8()) + T3 = append(T3, NewECP8(mem)) T3[0].Copy(Q[8]) // Q[0] - T3 = append(T3, NewECP8()) + T3 = append(T3, NewECP8(mem)) T3[1].Copy(T3[0]) - 
T3[1].Add(Q[9]) // Q[0]+Q[1] - T3 = append(T3, NewECP8()) + T3[1].Add(Q[9], mem) // Q[0]+Q[1] + T3 = append(T3, NewECP8(mem)) T3[2].Copy(T3[0]) - T3[2].Add(Q[10]) // Q[0]+Q[2] - T3 = append(T3, NewECP8()) + T3[2].Add(Q[10], mem) // Q[0]+Q[2] + T3 = append(T3, NewECP8(mem)) T3[3].Copy(T3[1]) - T3[3].Add(Q[10]) // Q[0]+Q[1]+Q[2] - T3 = append(T3, NewECP8()) + T3[3].Add(Q[10], mem) // Q[0]+Q[1]+Q[2] + T3 = append(T3, NewECP8(mem)) T3[4].Copy(T3[0]) - T3[4].Add(Q[11]) // Q[0]+Q[3] - T3 = append(T3, NewECP8()) + T3[4].Add(Q[11], mem) // Q[0]+Q[3] + T3 = append(T3, NewECP8(mem)) T3[5].Copy(T3[1]) - T3[5].Add(Q[11]) // Q[0]+Q[1]+Q[3] - T3 = append(T3, NewECP8()) + T3[5].Add(Q[11], mem) // Q[0]+Q[1]+Q[3] + T3 = append(T3, NewECP8(mem)) T3[6].Copy(T3[2]) - T3[6].Add(Q[11]) // Q[0]+Q[2]+Q[3] - T3 = append(T3, NewECP8()) + T3[6].Add(Q[11], mem) // Q[0]+Q[2]+Q[3] + T3 = append(T3, NewECP8(mem)) T3[7].Copy(T3[3]) - T3[7].Add(Q[11]) // Q[0]+Q[1]+Q[2]+Q[3] + T3[7].Add(Q[11], mem) // Q[0]+Q[1]+Q[2]+Q[3] - T4 = append(T4, NewECP8()) + T4 = append(T4, NewECP8(mem)) T4[0].Copy(Q[12]) // Q[0] - T4 = append(T4, NewECP8()) + T4 = append(T4, NewECP8(mem)) T4[1].Copy(T4[0]) - T4[1].Add(Q[13]) // Q[0]+Q[1] - T4 = append(T4, NewECP8()) + T4[1].Add(Q[13], mem) // Q[0]+Q[1] + T4 = append(T4, NewECP8(mem)) T4[2].Copy(T4[0]) - T4[2].Add(Q[14]) // Q[0]+Q[2] - T4 = append(T4, NewECP8()) + T4[2].Add(Q[14], mem) // Q[0]+Q[2] + T4 = append(T4, NewECP8(mem)) T4[3].Copy(T4[1]) - T4[3].Add(Q[14]) // Q[0]+Q[1]+Q[2] - T4 = append(T4, NewECP8()) + T4[3].Add(Q[14], mem) // Q[0]+Q[1]+Q[2] + T4 = append(T4, NewECP8(mem)) T4[4].Copy(T4[0]) - T4[4].Add(Q[15]) // Q[0]+Q[3] - T4 = append(T4, NewECP8()) + T4[4].Add(Q[15], mem) // Q[0]+Q[3] + T4 = append(T4, NewECP8(mem)) T4[5].Copy(T4[1]) - T4[5].Add(Q[15]) // Q[0]+Q[1]+Q[3] - T4 = append(T4, NewECP8()) + T4[5].Add(Q[15], mem) // Q[0]+Q[1]+Q[3] + T4 = append(T4, NewECP8(mem)) T4[6].Copy(T4[2]) - T4[6].Add(Q[15]) // Q[0]+Q[2]+Q[3] - T4 = append(T4, NewECP8()) + 
T4[6].Add(Q[15], mem) // Q[0]+Q[2]+Q[3] + T4 = append(T4, NewECP8(mem)) T4[7].Copy(T4[3]) - T4[7].Add(Q[15]) // Q[0]+Q[1]+Q[2]+Q[3] + T4[7].Add(Q[15], mem) // Q[0]+Q[1]+Q[2]+Q[3] // Make them odd pb1 := 1 - t[0].parity() @@ -1037,38 +960,38 @@ func Mul16(Q []*ECP8, u []*BIG) *ECP8 { // Main loop P.selector(T1, int32(2*w1[nb-1]+1)) W.selector(T2, int32(2*w2[nb-1]+1)) - P.Add(W) + P.Add(W, mem) W.selector(T3, int32(2*w3[nb-1]+1)) - P.Add(W) + P.Add(W, mem) W.selector(T4, int32(2*w4[nb-1]+1)) - P.Add(W) + P.Add(W, mem) for i := nb - 2; i >= 0; i-- { - P.Dbl() + P.Dbl(mem) W.selector(T1, int32(2*w1[i]+s1[i])) - P.Add(W) + P.Add(W, mem) W.selector(T2, int32(2*w2[i]+s2[i])) - P.Add(W) + P.Add(W, mem) W.selector(T3, int32(2*w3[i]+s3[i])) - P.Add(W) + P.Add(W, mem) W.selector(T4, int32(2*w4[i]+s4[i])) - P.Add(W) + P.Add(W, mem) } // apply correction W.Copy(P) - W.Sub(Q[0]) + W.Sub(Q[0], mem) P.cmove(W, pb1) W.Copy(P) - W.Sub(Q[4]) + W.Sub(Q[4], mem) P.cmove(W, pb2) W.Copy(P) - W.Sub(Q[8]) + W.Sub(Q[8], mem) P.cmove(W, pb3) W.Copy(P) - W.Sub(Q[12]) + W.Sub(Q[12], mem) P.cmove(W, pb4) - P.Affine() + P.Affine(mem) return P } diff --git a/nekryptology/pkg/core/curves/native/bls48581/hpke.go b/nekryptology/pkg/core/curves/native/bls48581/hpke.go deleted file mode 100644 index eb80eb7..0000000 --- a/nekryptology/pkg/core/curves/native/bls48581/hpke.go +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2012-2020 MIRACL UK Ltd. - * - * This file is part of MIRACL Core - * (see https://github.com/miracl/ext.. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Hybrid Public Key Encryption */ - -/* Following https://datatracker.ietf.org/doc/draft-irtf-cfrg-hpke/?include_text=1 */ - -package bls48581 - -import "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" - -//import "fmt" - -func reverse(X []byte) { - lx := len(X) - for i := 0; i < lx/2; i++ { - ch := X[i] - X[i] = X[lx-i-1] - X[lx-i-1] = ch - } -} - -func labeledExtract(SALT []byte, SUITE_ID []byte, label string, IKM []byte) []byte { - rfc := "HPKE-v1" - RFC := []byte(rfc) - LABEL := []byte(label) - var LIKM []byte - for i := 0; i < len(RFC); i++ { - LIKM = append(LIKM, RFC[i]) - } - for i := 0; i < len(SUITE_ID); i++ { - LIKM = append(LIKM, SUITE_ID[i]) - } - for i := 0; i < len(LABEL); i++ { - LIKM = append(LIKM, LABEL[i]) - } - if IKM != nil { - for i := 0; i < len(IKM); i++ { - LIKM = append(LIKM, IKM[i]) - } - } - return ext.HKDF_Extract(ext.MC_SHA2, HASH_TYPE, SALT, LIKM) -} - -func labeledExpand(PRK []byte, SUITE_ID []byte, label string, INFO []byte, L int) []byte { - rfc := "HPKE-v1" - RFC := []byte(rfc) - LABEL := []byte(label) - AR := ext.InttoBytes(L, 2) - var LINFO []byte - for i := 0; i < len(AR); i++ { - LINFO = append(LINFO, AR[i]) - } - for i := 0; i < len(RFC); i++ { - LINFO = append(LINFO, RFC[i]) - } - for i := 0; i < len(SUITE_ID); i++ { - LINFO = append(LINFO, SUITE_ID[i]) - } - for i := 0; i < len(LABEL); i++ { - LINFO = append(LINFO, LABEL[i]) - } - if INFO != nil { - for i := 0; i < len(INFO); i++ { - LINFO = append(LINFO, INFO[i]) - } - } - - return ext.HKDF_Expand(ext.MC_SHA2, HASH_TYPE, L, PRK, LINFO) -} - -func extractAndExpand(config_id int, DH []byte, context []byte) []byte { - kem := config_id & 255 - txt := "KEM" - KEM_ID := ext.InttoBytes(kem, 2) - KEM := []byte(txt) - var SUITE_ID []byte - for i := 0; i < len(KEM); i++ { - SUITE_ID = append(SUITE_ID, KEM[i]) - } - SUITE_ID = 
append(SUITE_ID, KEM_ID[0]) - SUITE_ID = append(SUITE_ID, KEM_ID[1]) - - PRK := labeledExtract(nil, SUITE_ID, "eae_prk", DH) - return labeledExpand(PRK, SUITE_ID, "shared_secret", context, HASH_TYPE) -} - -func DeriveKeyPair(config_id int, SK []byte, PK []byte, SEED []byte) bool { - counter := 0 - kem := config_id & 255 - - txt := "KEM" - KEM_ID := ext.InttoBytes(kem, 2) - KEM := []byte(txt) - var SUITE_ID []byte - for i := 0; i < len(KEM); i++ { - SUITE_ID = append(SUITE_ID, KEM[i]) - } - SUITE_ID = append(SUITE_ID, KEM_ID[0]) - SUITE_ID = append(SUITE_ID, KEM_ID[1]) - - PRK := labeledExtract(nil, SUITE_ID, "dkp_prk", SEED) - var S []byte - if kem == 32 || kem == 33 { // RFC7748 - S = labeledExpand(PRK, SUITE_ID, "sk", nil, EGS) - reverse(S) - if kem == 32 { - S[EGS-1] &= 248 - S[0] &= 127 - S[0] |= 64 - } else { - S[EGS-1] &= 252 - S[0] |= 128 - } - } else { - bit_mask := 0xff - if kem == 18 { - bit_mask = 1 - } - for i := 0; i < EGS; i++ { - S = append(S, 0) - } - for !ECDH_IN_RANGE(S) && counter < 256 { - var INFO [1]byte - INFO[0] = byte(counter) - S = labeledExpand(PRK, SUITE_ID, "candidate", INFO[:], EGS) - S[0] &= byte(bit_mask) - counter++ - } - } - for i := 0; i < EGS; i++ { - SK[i] = S[i] - } - ECDH_KEY_PAIR_GENERATE(nil, SK, PK) - if kem == 32 || kem == 33 { - reverse(PK) - } - if counter < 256 { - return true - } - return false -} - -func Encap(config_id int, skE []byte, pkE []byte, pkR []byte) []byte { - DH := make([]byte, EFS) - var kemcontext []byte - kem := config_id & 255 - - if kem == 32 || kem == 33 { - reverse(pkR) - ECDH_ECPSVDP_DH(skE, pkR, DH[:], 0) - reverse(pkR) - reverse(DH[:]) - } else { - ECDH_ECPSVDP_DH(skE, pkR, DH[:], 0) - } - for i := 0; i < len(pkE); i++ { - kemcontext = append(kemcontext, pkE[i]) - } - for i := 0; i < len(pkR); i++ { - kemcontext = append(kemcontext, pkR[i]) - } - return extractAndExpand(config_id, DH[:], kemcontext) -} - -func Decap(config_id int, skR []byte, pkE []byte, pkR []byte) []byte { - DH := make([]byte, 
EFS) - var kemcontext []byte - kem := config_id & 255 - - if kem == 32 || kem == 33 { - reverse(pkE) - ECDH_ECPSVDP_DH(skR, pkE, DH[:], 0) - reverse(pkE) - reverse(DH[:]) - } else { - ECDH_ECPSVDP_DH(skR, pkE, DH[:], 0) - } - - for i := 0; i < len(pkE); i++ { - kemcontext = append(kemcontext, pkE[i]) - } - for i := 0; i < len(pkR); i++ { - kemcontext = append(kemcontext, pkR[i]) - } - return extractAndExpand(config_id, DH[:], kemcontext) -} - -func AuthEncap(config_id int, skE []byte, skS []byte, pkE []byte, pkR []byte, pkS []byte) []byte { - pklen := len(pkE) - DH := make([]byte, EFS) - DH1 := make([]byte, EFS) - - kemcontext := make([]byte, 3*pklen) - kem := config_id & 255 - - if kem == 32 || kem == 33 { - reverse(pkR) - ECDH_ECPSVDP_DH(skE, pkR, DH[:], 0) - ECDH_ECPSVDP_DH(skS, pkR, DH1[:], 0) - reverse(pkR) - reverse(DH[:]) - reverse(DH1[:]) - } else { - ECDH_ECPSVDP_DH(skE, pkR, DH[:], 0) - ECDH_ECPSVDP_DH(skS, pkR, DH1[:], 0) - } - ZZ := make([]byte, 2*EFS) - for i := 0; i < EFS; i++ { - ZZ[i] = DH[i] - ZZ[EFS+i] = DH1[i] - } - - for i := 0; i < pklen; i++ { - kemcontext[i] = pkE[i] - kemcontext[pklen+i] = pkR[i] - kemcontext[2*pklen+i] = pkS[i] - } - return extractAndExpand(config_id, ZZ[:], kemcontext) -} - -func AuthDecap(config_id int, skR []byte, pkE []byte, pkR []byte, pkS []byte) []byte { - pklen := len(pkE) - DH := make([]byte, EFS) - DH1 := make([]byte, EFS) - kemcontext := make([]byte, 3*pklen) - - kem := config_id & 255 - - if kem == 32 || kem == 33 { - reverse(pkE) - reverse(pkS) - ECDH_ECPSVDP_DH(skR[:], pkE, DH[:], 0) - ECDH_ECPSVDP_DH(skR[:], pkS, DH1[:], 0) - reverse(pkE) - reverse(pkS) - reverse(DH[:]) - reverse(DH1[:]) - } else { - ECDH_ECPSVDP_DH(skR[:], pkE, DH[:], 0) - ECDH_ECPSVDP_DH(skR[:], pkS, DH1[:], 0) - } - ZZ := make([]byte, 2*EFS) - for i := 0; i < EFS; i++ { - ZZ[i] = DH[i] - ZZ[EFS+i] = DH1[i] - } - - for i := 0; i < pklen; i++ { - kemcontext[i] = pkE[i] - kemcontext[pklen+i] = pkR[i] - kemcontext[2*pklen+i] = pkS[i] - } - 
return extractAndExpand(config_id, ZZ[:], kemcontext) -} - -/* -func printBinary(array []byte) { - for i := 0; i < len(array); i++ { - fmt.Printf("%02x", array[i]) - } - fmt.Printf("\n") -} -*/ - -func KeySchedule(config_id int, mode int, Z []byte, info []byte, psk []byte, pskID []byte) ([]byte, []byte, []byte) { - var context []byte - - kem := config_id & 255 - kdf := (config_id >> 8) & 3 - aead := (config_id >> 10) & 3 - - txt := "HPKE" - KEM := []byte(txt) - var SUITE_ID []byte - for i := 0; i < len(KEM); i++ { - SUITE_ID = append(SUITE_ID, KEM[i]) - } - num := ext.InttoBytes(kem, 2) - SUITE_ID = append(SUITE_ID, num[0]) - SUITE_ID = append(SUITE_ID, num[1]) - num = ext.InttoBytes(kdf, 2) - SUITE_ID = append(SUITE_ID, num[0]) - SUITE_ID = append(SUITE_ID, num[1]) - num = ext.InttoBytes(aead, 2) - SUITE_ID = append(SUITE_ID, num[0]) - SUITE_ID = append(SUITE_ID, num[1]) - - ar := ext.InttoBytes(mode, 1) - for i := 0; i < len(ar); i++ { - context = append(context, ar[i]) - } - - H := labeledExtract(nil, SUITE_ID, "psk_id_hash", pskID) - for i := 0; i < HASH_TYPE; i++ { - context = append(context, H[i]) - } - H = labeledExtract(nil, SUITE_ID, "info_hash", info) - for i := 0; i < HASH_TYPE; i++ { - context = append(context, H[i]) - } - //H=labeledExtract(nil,SUITE_ID,"psk_hash",psk) - //secret:=labeledExtract(H,SUITE_ID,"secret",Z) - - secret := labeledExtract(Z, SUITE_ID, "secret", psk) - - key := labeledExpand(secret, SUITE_ID, "key", context, AESKEY) - nonce := labeledExpand(secret, SUITE_ID, "base_nonce", context, 12) - exp_secret := labeledExpand(secret, SUITE_ID, "exp", context, HASH_TYPE) - - return key, nonce, exp_secret -} diff --git a/nekryptology/pkg/core/curves/native/bls48581/mpin256.go b/nekryptology/pkg/core/curves/native/bls48581/mpin256.go deleted file mode 100644 index dd1d970..0000000 --- a/nekryptology/pkg/core/curves/native/bls48581/mpin256.go +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright (c) 2012-2020 MIRACL UK Ltd. 
- * - * This file is part of MIRACL Core - * (see https://github.com/miracl/ext.. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* MPIN 256-bit API Functions */ - -package bls48581 - -import "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves/native/bls48581/ext" - -//import "fmt" - -const MFS int = int(MODBYTES) -const MGS int = int(MODBYTES) -const BAD_PARAMS int = -11 -const INVALID_POINT int = -14 -const WRONG_ORDER int = -18 -const BAD_PIN int = -19 - -/* Configure your PIN here */ - -const MAXPIN int32 = 10000 /* PIN less than this */ -const PBLEN int32 = 14 /* Number of bits in PIN */ - -func MPIN_HASH_ID(sha int, ID []byte) []byte { - return ext.GPhashit(ext.MC_SHA2, sha, int(MODBYTES), 0, nil, -1, ID) - //return mhashit(sha, 0, ID) -} - -func roundup(a int, b int) int { - return (((a)-1)/(b) + 1) -} - -func MPIN_ENCODE_TO_CURVE(DST []byte, ID []byte, HCID []byte) { - q := NewBIGints(Modulus) - k := q.Nbits() - r := NewBIGints(CURVE_Order) - m := r.Nbits() - L := roundup(k+roundup(m, 2), 8) - var fd = make([]byte, L) - OKM := ext.XMD_Expand(ext.MC_SHA2, HASH_TYPE, L, DST, ID) - - for j := 0; j < L; j++ { - fd[j] = OKM[j] - } - dx := DBIG_fromBytes(fd) - u := NewFPbig(dx.Mod(q)) - P := ECP_map2point(u) - - P.Cfp() - P.Affine() - P.ToBytes(HCID, false) -} - -/* create random secret S */ -func MPIN_RANDOM_GENERATE(rng *ext.RAND, S []byte) int { - r := NewBIGints(CURVE_Order) - s := Randtrunc(r, 16*AESKEY, rng) - 
s.ToBytes(S) - return 0 -} - -func MPIN_EXTRACT_PIN(CID []byte, pin int, TOKEN []byte) int { - P := ECP_fromBytes(TOKEN) - if P.Is_infinity() { - return INVALID_POINT - } - R := ECP_fromBytes(CID) - if R.Is_infinity() { - return INVALID_POINT - } - R = R.pinmul(int32(pin)%MAXPIN, PBLEN) - P.Sub(R) - P.ToBytes(TOKEN, false) - return 0 -} - -/* Implement step 2 on client side of MPin protocol */ -func MPIN_CLIENT_2(X []byte, Y []byte, SEC []byte) int { - r := NewBIGints(CURVE_Order) - P := ECP_fromBytes(SEC) - if P.Is_infinity() { - return INVALID_POINT - } - - px := FromBytes(X) - py := FromBytes(Y) - px.Add(py) - px.Mod(r) - - P = G1mul(P, px) - P.Neg() - P.ToBytes(SEC, false) - return 0 -} - -func MPIN_GET_CLIENT_SECRET(S []byte, IDHTC []byte, CST []byte) int { - s := FromBytes(S) - P := ECP_fromBytes(IDHTC) - if P.Is_infinity() { - return INVALID_POINT - } - G1mul(P, s).ToBytes(CST, false) - return 0 -} - -/* Implement step 1 on client side of MPin protocol */ -func MPIN_CLIENT_1(CID []byte, rng *ext.RAND, X []byte, pin int, TOKEN []byte, SEC []byte, xID []byte) int { - r := NewBIGints(CURVE_Order) - var x *BIG - if rng != nil { - x = Randtrunc(r, 16*AESKEY, rng) - x.ToBytes(X) - } else { - x = FromBytes(X) - } - - P := ECP_fromBytes(CID) - if P.Is_infinity() { - return INVALID_POINT - } - - T := ECP_fromBytes(TOKEN) - if T.Is_infinity() { - return INVALID_POINT - } - - W := P.pinmul(int32(pin)%MAXPIN, PBLEN) - T.Add(W) - - P = G1mul(P, x) - P.ToBytes(xID, false) - - T.ToBytes(SEC, false) - return 0 -} - -/* Extract Server Secret SST=S*Q where Q is fixed generator in G2 and S is master secret */ -func MPIN_GET_SERVER_SECRET(S []byte, SST []byte) int { - Q := ECP8_generator() - s := FromBytes(S) - Q = G2mul(Q, s) - Q.ToBytes(SST, false) - return 0 -} - -/* Implement step 2 of MPin protocol on server side */ -func MPIN_SERVER(HID []byte, Y []byte, SST []byte, xID []byte, mSEC []byte) int { - Q := ECP8_generator() - - sQ := ECP8_fromBytes(SST) - if sQ.Is_infinity() 
{ - return INVALID_POINT - } - - if xID == nil { - return BAD_PARAMS - } - R := ECP_fromBytes(xID) - if R.Is_infinity() { - return INVALID_POINT - } - y := FromBytes(Y) - if HID == nil { - return BAD_PARAMS - } - P := ECP_fromBytes(HID) - if P.Is_infinity() { - return INVALID_POINT - } - - P = G1mul(P, y) - P.Add(R) - R = ECP_fromBytes(mSEC) - if R.Is_infinity() { - return INVALID_POINT - } - - var g *FP48 - g = Ate2(Q, R, sQ, P) - g = Fexp(g) - - if !g.Isunity() { - return BAD_PIN - } - return 0 -} diff --git a/nekryptology/pkg/core/curves/native/bls48581/pair8.go b/nekryptology/pkg/core/curves/native/bls48581/pair8.go index 123f253..39f988f 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/pair8.go +++ b/nekryptology/pkg/core/curves/native/bls48581/pair8.go @@ -21,111 +21,98 @@ package bls48581 +import ( + "arena" +) + //import "fmt" // Point doubling for pairings -func dbl(A *ECP8, AA *FP8, BB *FP8, CC *FP8) { - CC.copy(A.getx()) //X - YY := NewFP8copy(A.gety()) //Y - BB.copy(A.getz()) //Z - AA.copy(YY) //Y - AA.Mul(BB) //YZ - CC.Sqr() //X^2 - YY.Sqr() //Y^2 - BB.Sqr() //Z^2 +func dbl(A *ECP8, AA *FP8, BB *FP8, CC *FP8, mem *arena.Arena) { + CC.copy(A.getx()) //X + YY := NewFP8copy(A.gety(), mem) //Y + BB.copy(A.getz()) //Z + AA.copy(YY) //Y + AA.Mul(BB, mem) //YZ + CC.Sqr(mem) //X^2 + YY.Sqr(mem) //Y^2 + BB.Sqr(mem) //Z^2 - AA.Add(AA) - AA.Neg() + AA.Add(AA, mem) + AA.Neg(mem) AA.norm() //-2AA - AA.times_i() + AA.times_i(mem) sb := 3 * CURVE_B_I - BB.imul(sb) - CC.imul(3) - if SEXTIC_TWIST == D_TYPE { - YY.times_i() - CC.times_i() - } - if SEXTIC_TWIST == M_TYPE { - BB.times_i() - } - BB.Sub(YY) + BB.imul(sb, mem) + CC.imul(3, mem) + YY.times_i(mem) + CC.times_i(mem) + BB.Sub(YY, mem) BB.norm() - A.Dbl() + A.Dbl(mem) } // Point addition for pairings -func add(A *ECP8, B *ECP8, AA *FP8, BB *FP8, CC *FP8) { - AA.copy(A.getx()) // X1 - CC.copy(A.gety()) // Y1 - T1 := NewFP8copy(A.getz()) // Z1 - BB.copy(A.getz()) // Z1 +func add(A *ECP8, B *ECP8, AA *FP8, 
BB *FP8, CC *FP8, mem *arena.Arena) { + AA.copy(A.getx()) // X1 + CC.copy(A.gety()) // Y1 + T1 := NewFP8copy(A.getz(), mem) // Z1 + BB.copy(A.getz()) // Z1 - T1.Mul(B.gety()) // T1=Z1.Y2 - BB.Mul(B.getx()) // T2=Z1.X2 + T1.Mul(B.gety(), mem) // T1=Z1.Y2 + BB.Mul(B.getx(), mem) // T2=Z1.X2 - AA.Sub(BB) + AA.Sub(BB, mem) AA.norm() // X1=X1-Z1.X2 - CC.Sub(T1) + CC.Sub(T1, mem) CC.norm() // Y1=Y1-Z1.Y2 T1.copy(AA) // T1=X1-Z1.X2 - if SEXTIC_TWIST == M_TYPE { - AA.times_i() - AA.norm() - } + T1.Mul(B.gety(), mem) // T1=(X1-Z1.X2).Y2 - T1.Mul(B.gety()) // T1=(X1-Z1.X2).Y2 - - BB.copy(CC) // T2=Y1-Z1.Y2 - BB.Mul(B.getx()) // T2=(Y1-Z1.Y2).X2 - BB.Sub(T1) + BB.copy(CC) // T2=Y1-Z1.Y2 + BB.Mul(B.getx(), mem) // T2=(Y1-Z1.Y2).X2 + BB.Sub(T1, mem) BB.norm() // T2=(Y1-Z1.Y2).X2 - (X1-Z1.X2).Y2 - CC.Neg() + CC.Neg(mem) CC.norm() // Y1=-(Y1-Z1.Y2).Xs - A.Add(B) + A.Add(B, mem) } -func line(A *ECP8, B *ECP8, Qx *FP, Qy *FP) *FP48 { - AA := NewFP8() - BB := NewFP8() - CC := NewFP8() +func line(A *ECP8, B *ECP8, Qx *FP, Qy *FP, mem *arena.Arena) *FP48 { + AA := NewFP8(mem) + BB := NewFP8(mem) + CC := NewFP8(mem) var a *FP16 var b *FP16 var c *FP16 if A == B { - dbl(A, AA, BB, CC) + dbl(A, AA, BB, CC, mem) } else { - add(A, B, AA, BB, CC) + add(A, B, AA, BB, CC, mem) } - CC.tmul(Qx) - AA.tmul(Qy) + CC.tmul(Qx, mem) + AA.tmul(Qy, mem) - a = NewFP16fp8s(AA, BB) + a = NewFP16fp8s(AA, BB, mem) - if SEXTIC_TWIST == D_TYPE { - b = NewFP16fp8(CC) // L(0,1) | L(0,0) | L(1,0) - c = NewFP16() - } - if SEXTIC_TWIST == M_TYPE { - b = NewFP16() - c = NewFP16fp8(CC) - c.times_i() - } + b = NewFP16fp8(CC, mem) // L(0,1) | L(0,0) | L(1,0) + c = NewFP16(mem) - r := NewFP48fp16s(a, b, c) + r := NewFP48fp16s(a, b, c, mem) r.stype = FP_SPARSER return r } /* prepare ate parameter, n=6u+2 (BN) or n=u (BLS), n3=3*n */ -func lbits(n3 *BIG, n *BIG) int { - n.copy(NewBIGints(CURVE_Bnx)) +func lbits(n3 *BIG, n *BIG, mem *arena.Arena) int { + n.copy(NewBIGints(CURVE_Bnx, mem)) n3.copy(n) n3.pmul(3) n3.norm() 
@@ -133,40 +120,38 @@ func lbits(n3 *BIG, n *BIG) int { } /* prepare for multi-pairing */ -func Initmp() []*FP48 { +func Initmp(mem *arena.Arena) []*FP48 { var r []*FP48 for i := ATE_BITS - 1; i >= 0; i-- { - r = append(r, NewFP48int(1)) + r = append(r, NewFP48int(1, mem)) } return r } /* basic Miller loop */ -func Miller(r []*FP48) *FP48 { - res := NewFP48int(1) +func Miller(r []*FP48, mem *arena.Arena) *FP48 { + res := NewFP48int(1, mem) for i := ATE_BITS - 1; i >= 1; i-- { - res.Sqr() - res.ssmul(r[i]) + res.Sqr(mem) + res.ssmul(r[i], mem) r[i].zero() } - if SIGN_OF_X == NEGATIVEX { - res.conj() - } - res.ssmul(r[0]) + res.conj(mem) + res.ssmul(r[0], mem) r[0].zero() return res } // Store precomputed line details in an FP8 func pack(AA *FP8, BB *FP8, CC *FP8) *FP16 { - i := NewFP8copy(CC) - i.Invert(nil) - a := NewFP8copy(AA) - a.Mul(i) - b := NewFP8copy(BB) - b.Mul(i) - return NewFP16fp8s(a, b) + i := NewFP8copy(CC, nil) + i.Invert(nil, nil) + a := NewFP8copy(AA, nil) + a.Mul(i, nil) + b := NewFP8copy(BB, nil) + b.Mul(i, nil) + return NewFP16fp8s(a, b, nil) } // Unpack G2 line function details and include G1 @@ -175,52 +160,45 @@ func unpack(T *FP16, Qx *FP, Qy *FP) *FP48 { var b *FP16 var c *FP16 - a = NewFP16copy(T) - a.geta().tmul(Qy) - t := NewFP8fp(Qx) - if SEXTIC_TWIST == D_TYPE { - b = NewFP16fp8(t) - c = NewFP16() - } - if SEXTIC_TWIST == M_TYPE { - b = NewFP16() - c = NewFP16fp8(t) - c.times_i() - } - v := NewFP48fp16s(a, b, c) + a = NewFP16copy(T, nil) + a.geta().tmul(Qy, nil) + t := NewFP8fp(Qx, nil) + b = NewFP16fp8(t, nil) + c = NewFP16(nil) + v := NewFP48fp16s(a, b, c, nil) v.stype = FP_SPARSEST return v } func precomp(GV *ECP8) []*FP16 { - n := NewBIG() - n3 := NewBIG() - AA := NewFP8() - BB := NewFP8() - CC := NewFP8() + n := NewBIG(nil) + n3 := NewBIG(nil) + AA := NewFP8(nil) + BB := NewFP8(nil) + CC := NewFP8(nil) var bt int - P := NewECP8() + P := NewECP8(nil) P.Copy(GV) - A := NewECP8() + A := NewECP8(nil) A.Copy(P) - MP := NewECP8() + MP := 
NewECP8(nil) MP.Copy(P) - MP.Neg() + MP.Neg(nil) - nb := lbits(n3, n) + nb := lbits(n3, n, nil) var T []*FP16 for i := nb - 2; i >= 1; i-- { - dbl(A, AA, BB, CC) + dbl(A, AA, BB, CC, nil) T = append(T, pack(AA, BB, CC)) bt = n3.bit(i) - n.bit(i) if bt == 1 { - add(A, P, AA, BB, CC) + add(A, P, AA, BB, CC, nil) T = append(T, pack(AA, BB, CC)) } if bt == -1 { - add(A, MP, AA, BB, CC) + add(A, MP, AA, BB, CC, nil) T = append(T, pack(AA, BB, CC)) } } @@ -228,22 +206,22 @@ func precomp(GV *ECP8) []*FP16 { } func Another_pc(r []*FP48, T []*FP16, QV *ECP) { - n := NewBIG() - n3 := NewBIG() + n := NewBIG(nil) + n3 := NewBIG(nil) var lv, lv2 *FP48 var bt, j int - if QV.Is_infinity() { + if QV.Is_infinity(nil) { return } - Q := NewECP() + Q := NewECP(nil) Q.Copy(QV) - Q.Affine() - Qx := NewFPcopy(Q.getx()) - Qy := NewFPcopy(Q.gety()) + Q.Affine(nil) + Qx := NewFPcopy(Q.getx(), nil) + Qy := NewFPcopy(Q.gety(), nil) - nb := lbits(n3, n) + nb := lbits(n3, n, nil) j = 0 for i := nb - 2; i >= 1; i-- { lv = unpack(T[j], Qx, Qy) @@ -252,625 +230,452 @@ func Another_pc(r []*FP48, T []*FP16, QV *ECP) { if bt == 1 { lv2 = unpack(T[j], Qx, Qy) j += 1 - lv.smul(lv2) + lv.smul(lv2, nil) } if bt == -1 { lv2 = unpack(T[j], Qx, Qy) j += 1 - lv.smul(lv2) + lv.smul(lv2, nil) } - r[i].ssmul(lv) + r[i].ssmul(lv, nil) } } /* Accumulate another set of line functions for n-pairing */ -func Another(r []*FP48, P1 *ECP8, Q1 *ECP) { - n := NewBIG() - n3 := NewBIG() +func Another(r []*FP48, P1 *ECP8, Q1 *ECP, mem *arena.Arena) { + n := NewBIG(mem) + n3 := NewBIG(mem) var lv, lv2 *FP48 - if Q1.Is_infinity() { + if Q1.Is_infinity(mem) { return } // P is needed in affine form for line function, Q for (Qx,Qy) extraction - P := NewECP8() + P := NewECP8(mem) P.Copy(P1) - Q := NewECP() + Q := NewECP(mem) Q.Copy(Q1) - P.Affine() - Q.Affine() + P.Affine(mem) + Q.Affine(mem) - Qx := NewFPcopy(Q.getx()) - Qy := NewFPcopy(Q.gety()) + Qx := NewFPcopy(Q.getx(), mem) + Qy := NewFPcopy(Q.gety(), mem) - A := NewECP8() 
+ A := NewECP8(mem) A.Copy(P) - MP := NewECP8() + MP := NewECP8(mem) MP.Copy(P) - MP.Neg() + MP.Neg(mem) - nb := lbits(n3, n) + nb := lbits(n3, n, mem) for i := nb - 2; i >= 1; i-- { - lv = line(A, A, Qx, Qy) + lv = line(A, A, Qx, Qy, mem) bt := n3.bit(i) - n.bit(i) if bt == 1 { - lv2 = line(A, P, Qx, Qy) - lv.smul(lv2) + lv2 = line(A, P, Qx, Qy, mem) + lv.smul(lv2, mem) } if bt == -1 { - lv2 = line(A, MP, Qx, Qy) - lv.smul(lv2) + lv2 = line(A, MP, Qx, Qy, mem) + lv.smul(lv2, mem) } - r[i].ssmul(lv) + r[i].ssmul(lv, mem) } } /* Optimal R-ate pairing */ func Ate(P1 *ECP8, Q1 *ECP) *FP48 { - n := NewBIG() - n3 := NewBIG() + n := NewBIG(nil) + n3 := NewBIG(nil) var lv, lv2 *FP48 - if Q1.Is_infinity() { - return NewFP48int(1) + if Q1.Is_infinity(nil) { + return NewFP48int(1, nil) } - P := NewECP8() + P := NewECP8(nil) P.Copy(P1) - P.Affine() - Q := NewECP() + P.Affine(nil) + Q := NewECP(nil) Q.Copy(Q1) - Q.Affine() + Q.Affine(nil) - Qx := NewFPcopy(Q.getx()) - Qy := NewFPcopy(Q.gety()) + Qx := NewFPcopy(Q.getx(), nil) + Qy := NewFPcopy(Q.gety(), nil) - A := NewECP8() - r := NewFP48int(1) + A := NewECP8(nil) + r := NewFP48int(1, nil) A.Copy(P) - NP := NewECP8() + NP := NewECP8(nil) NP.Copy(P) - NP.Neg() + NP.Neg(nil) - nb := lbits(n3, n) + nb := lbits(n3, n, nil) for i := nb - 2; i >= 1; i-- { - r.Sqr() - lv = line(A, A, Qx, Qy) + r.Sqr(nil) + lv = line(A, A, Qx, Qy, nil) bt := n3.bit(i) - n.bit(i) if bt == 1 { - lv2 = line(A, P, Qx, Qy) - lv.smul(lv2) + lv2 = line(A, P, Qx, Qy, nil) + lv.smul(lv2, nil) } if bt == -1 { - lv2 = line(A, NP, Qx, Qy) - lv.smul(lv2) + lv2 = line(A, NP, Qx, Qy, nil) + lv.smul(lv2, nil) } - r.ssmul(lv) + r.ssmul(lv, nil) } - if SIGN_OF_X == NEGATIVEX { - r.conj() - } + r.conj(nil) return r } /* Optimal R-ate double pairing e(P,Q).e(R,S) */ func Ate2(P1 *ECP8, Q1 *ECP, R1 *ECP8, S1 *ECP) *FP48 { - n := NewBIG() - n3 := NewBIG() + n := NewBIG(nil) + n3 := NewBIG(nil) var lv, lv2 *FP48 - if Q1.Is_infinity() { + if Q1.Is_infinity(nil) { return 
Ate(R1, S1) } - if S1.Is_infinity() { + if S1.Is_infinity(nil) { return Ate(P1, Q1) } - P := NewECP8() + P := NewECP8(nil) P.Copy(P1) - P.Affine() - Q := NewECP() + P.Affine(nil) + Q := NewECP(nil) Q.Copy(Q1) - Q.Affine() - R := NewECP8() + Q.Affine(nil) + R := NewECP8(nil) R.Copy(R1) - R.Affine() - S := NewECP() + R.Affine(nil) + S := NewECP(nil) S.Copy(S1) - S.Affine() + S.Affine(nil) - Qx := NewFPcopy(Q.getx()) - Qy := NewFPcopy(Q.gety()) - Sx := NewFPcopy(S.getx()) - Sy := NewFPcopy(S.gety()) + Qx := NewFPcopy(Q.getx(), nil) + Qy := NewFPcopy(Q.gety(), nil) + Sx := NewFPcopy(S.getx(), nil) + Sy := NewFPcopy(S.gety(), nil) - A := NewECP8() - B := NewECP8() - r := NewFP48int(1) + A := NewECP8(nil) + B := NewECP8(nil) + r := NewFP48int(1, nil) A.Copy(P) B.Copy(R) - NP := NewECP8() + NP := NewECP8(nil) NP.Copy(P) - NP.Neg() - NR := NewECP8() + NP.Neg(nil) + NR := NewECP8(nil) NR.Copy(R) - NR.Neg() + NR.Neg(nil) - nb := lbits(n3, n) + nb := lbits(n3, n, nil) for i := nb - 2; i >= 1; i-- { - r.Sqr() - lv = line(A, A, Qx, Qy) - lv2 = line(B, B, Sx, Sy) - lv.smul(lv2) - r.ssmul(lv) + r.Sqr(nil) + lv = line(A, A, Qx, Qy, nil) + lv2 = line(B, B, Sx, Sy, nil) + lv.smul(lv2, nil) + r.ssmul(lv, nil) bt := n3.bit(i) - n.bit(i) if bt == 1 { - lv = line(A, P, Qx, Qy) - lv2 = line(B, R, Sx, Sy) - lv.smul(lv2) - r.ssmul(lv) + lv = line(A, P, Qx, Qy, nil) + lv2 = line(B, R, Sx, Sy, nil) + lv.smul(lv2, nil) + r.ssmul(lv, nil) } if bt == -1 { - lv = line(A, NP, Qx, Qy) - lv2 = line(B, NR, Sx, Sy) - lv.smul(lv2) - r.ssmul(lv) + lv = line(A, NP, Qx, Qy, nil) + lv2 = line(B, NR, Sx, Sy, nil) + lv.smul(lv2, nil) + r.ssmul(lv, nil) } } - if SIGN_OF_X == NEGATIVEX { - r.conj() - } + r.conj(nil) return r } /* final exponentiation - keep separate for multi-pairings and to avoid thrashing stack */ func Fexp(m *FP48) *FP48 { - f := NewFP2bigs(NewBIGints(Fra), NewBIGints(Frb)) - x := NewBIGints(CURVE_Bnx) - r := NewFP48copy(m) + mem := arena.NewArena() + f := NewFP2bigs(NewBIGints(Fra, mem), 
NewBIGints(Frb, mem), mem) + x := NewBIGints(CURVE_Bnx, mem) + r := NewFP48copy(m, nil) // var t1, t2 *FP48 /* Easy part of final exp */ - lv := NewFP48copy(r) + lv := NewFP48copy(r, mem) - lv.Invert() - r.conj() + lv.Invert(mem) + r.conj(mem) - r.Mul(lv) + r.Mul(lv, mem) lv.Copy(r) - r.frob(f, 8) - r.Mul(lv) + r.frob(f, 8, mem) + r.Mul(lv, mem) /* Hard part of final exp */ // See https://eprint.iacr.org/2020/875.pdf - y1 := NewFP48copy(r) - y1.uSqr() - y1.Mul(r) // y1=r^3 + y1 := NewFP48copy(r, mem) + y1.uSqr(mem) + y1.Mul(r, mem) // y1=r^3 - y0 := NewFP48copy(r.Pow(x)) - if SIGN_OF_X == NEGATIVEX { - y0.conj() - } - t0 := NewFP48copy(r) - t0.conj() + y0 := NewFP48copy(r.Pow(x, mem), mem) + y0.conj(mem) + t0 := NewFP48copy(r, mem) + t0.conj(mem) r.Copy(y0) - r.Mul(t0) + r.Mul(t0, mem) - y0.Copy(r.Pow(x)) - if SIGN_OF_X == NEGATIVEX { - y0.conj() - } + y0.Copy(r.Pow(x, mem)) + y0.conj(mem) t0.Copy(r) - t0.conj() + t0.conj(mem) r.Copy(y0) - r.Mul(t0) + r.Mul(t0, mem) // ^(x+p) - y0.Copy(r.Pow(x)) - if SIGN_OF_X == NEGATIVEX { - y0.conj() - } + y0.Copy(r.Pow(x, mem)) + y0.conj(mem) t0.Copy(r) - t0.frob(f, 1) + t0.frob(f, 1, mem) r.Copy(y0) - r.Mul(t0) + r.Mul(t0, mem) // ^(x^2+p^2) - y0.Copy(r.Pow(x)) - y0.Copy(y0.Pow(x)) + y0.Copy(r.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) t0.Copy(r) - t0.frob(f, 2) + t0.frob(f, 2, mem) r.Copy(y0) - r.Mul(t0) + r.Mul(t0, mem) // ^(x^4+p^4) - y0.Copy(r.Pow(x)) - y0.Copy(y0.Pow(x)) - y0.Copy(y0.Pow(x)) - y0.Copy(y0.Pow(x)) + y0.Copy(r.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) t0.Copy(r) - t0.frob(f, 4) + t0.frob(f, 4, mem) r.Copy(y0) - r.Mul(t0) + r.Mul(t0, mem) // ^(x^8+p^8-1) - y0.Copy(r.Pow(x)) - y0.Copy(y0.Pow(x)) - y0.Copy(y0.Pow(x)) - y0.Copy(y0.Pow(x)) - y0.Copy(y0.Pow(x)) - y0.Copy(y0.Pow(x)) - y0.Copy(y0.Pow(x)) - y0.Copy(y0.Pow(x)) + y0.Copy(r.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) + y0.Copy(y0.Pow(x, 
mem)) + y0.Copy(y0.Pow(x, mem)) + y0.Copy(y0.Pow(x, mem)) t0.Copy(r) - t0.frob(f, 8) - y0.Mul(t0) + t0.frob(f, 8, mem) + y0.Mul(t0, mem) t0.Copy(r) - t0.conj() + t0.conj(mem) r.Copy(y0) - r.Mul(t0) + r.Mul(t0, mem) - r.Mul(y1) - r.reduce() + r.Mul(y1, mem) + r.reduce(mem) + mem.Free() - /* - // Ghamman & Fouotsa Method - - t7 := NewFP48copy(r) - t7.usqr() - - if x.parity() == 1 { - t2 = r.Pow(x) - t1 = NewFP48copy(t2) - t1.usqr() - t2 = t2.Pow(x) - } else { - t1 = t7.Pow(x) - x.fshr(1) - t2 = t1.Pow(x) - x.fshl(1) - } - - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3 := NewFP48copy(t1) - t3.conj() - t2.Mul(t3) - t2.Mul(r) - - r.Mul(t7) - - t1 = t2.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - t3.Copy(t1) - t3.frob(f, 14) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 13) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 12) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 11) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 10) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 9) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 8) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t2) - t3.conj() - t1.Mul(t3) - t3.Copy(t1) - t3.frob(f, 7) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 6) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 5) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 4) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 3) - r.Mul(t3) - t1 = t1.Pow(x) - if 
SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 2) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - t3.Copy(t1) - t3.frob(f, 1) - r.Mul(t3) - t1 = t1.Pow(x) - if SIGN_OF_X == NEGATIVEX { - t1.conj() - } - - r.Mul(t1) - t2.frob(f, 15) - r.Mul(t2) - - r.reduce() - */ return r } /* GLV method */ -func glv(ee *BIG) []*BIG { +func glv(ee *BIG, mem *arena.Arena) []*BIG { var u []*BIG - q := NewBIGints(CURVE_Order) - x := NewBIGints(CURVE_Bnx) + q := NewBIGints(CURVE_Order, mem) + x := NewBIGints(CURVE_Bnx, mem) x2 := smul(x, x) x = smul(x2, x2) x2 = smul(x, x) bd := uint(q.nbits() - x2.nbits()) - u = append(u, NewBIGcopy(ee)) - u[0].ctmod(x2, bd) - u = append(u, NewBIGcopy(ee)) - u[1].ctdiv(x2, bd) + u = append(u, NewBIGcopy(ee, mem)) + u[0].ctmod(x2, bd, mem) + u = append(u, NewBIGcopy(ee, mem)) + u[1].ctdiv(x2, bd, mem) u[1].rsub(q) return u } /* Galbraith & Scott Method */ -func gs(ee *BIG) []*BIG { +func gs(ee *BIG, mem *arena.Arena) []*BIG { var u []*BIG - q := NewBIGints(CURVE_Order) - x := NewBIGints(CURVE_Bnx) + q := NewBIGints(CURVE_Order, mem) + x := NewBIGints(CURVE_Bnx, mem) bd := uint(q.nbits() - x.nbits()) - w := NewBIGcopy(ee) + w := NewBIGcopy(ee, mem) for i := 0; i < 15; i++ { - u = append(u, NewBIGcopy(w)) - u[i].ctmod(x, bd) - w.ctdiv(x, bd) - } - u = append(u, NewBIGcopy(w)) - if SIGN_OF_X == NEGATIVEX { - u[1].copy(Modneg(u[1], q)) - u[3].copy(Modneg(u[3], q)) - u[5].copy(Modneg(u[5], q)) - u[7].copy(Modneg(u[7], q)) - u[9].copy(Modneg(u[9], q)) - u[11].copy(Modneg(u[11], q)) - u[13].copy(Modneg(u[13], q)) - u[15].copy(Modneg(u[15], q)) + u = append(u, NewBIGcopy(w, mem)) + u[i].ctmod(x, bd, mem) + w.ctdiv(x, bd, mem) } + u = append(u, NewBIGcopy(w, mem)) + u[1].copy(Modneg(u[1], q, mem)) + u[3].copy(Modneg(u[3], q, mem)) + u[5].copy(Modneg(u[5], q, mem)) + u[7].copy(Modneg(u[7], q, mem)) + u[9].copy(Modneg(u[9], q, mem)) + u[11].copy(Modneg(u[11], q, mem)) + u[13].copy(Modneg(u[13], q, mem)) + 
u[15].copy(Modneg(u[15], q, mem)) return u } /* Multiply P by e in group G1 */ -func G1mul(P *ECP, e *BIG) *ECP { +func G1mul(P *ECP, e *BIG, mem *arena.Arena) *ECP { var R *ECP - q := NewBIGints(CURVE_Order) - ee := NewBIGcopy(e) - ee.Mod(q) - if USE_GLV { - R = NewECP() - R.Copy(P) - Q := NewECP() - Q.Copy(P) - Q.Affine() + q := NewBIGints(CURVE_Order, mem) + ee := NewBIGcopy(e, mem) + ee.Mod(q, mem) + R = NewECP(mem) + R.Copy(P) + Q := NewECP(mem) + Q.Copy(P) + Q.Affine(mem) - cru := NewFPbig(NewBIGints(CRu)) - t := NewBIGint(0) - u := glv(ee) - Q.getx().Mul(cru) + cru := NewFPbig(NewBIGints(CRu, mem), mem) + t := NewBIGint(0, mem) + u := glv(ee, mem) + Q.getx().Mul(cru, mem) - np := u[0].nbits() - t.copy(Modneg(u[0], q)) - nn := t.nbits() - if nn < np { - u[0].copy(t) - R.Neg() - } - - np = u[1].nbits() - t.copy(Modneg(u[1], q)) - nn = t.nbits() - if nn < np { - u[1].copy(t) - Q.Neg() - } - u[0].norm() - u[1].norm() - R = R.Mul2(u[0], Q, u[1]) - - } else { - R = P.clmul(e, q) + np := u[0].nbits() + t.copy(Modneg(u[0], q, mem)) + nn := t.nbits() + if nn < np { + u[0].copy(t) + R.Neg(mem) } + + np = u[1].nbits() + t.copy(Modneg(u[1], q, mem)) + nn = t.nbits() + if nn < np { + u[1].copy(t) + Q.Neg(mem) + } + u[0].norm() + u[1].norm() + R = R.Mul2(u[0], Q, u[1], mem) + return R } /* Multiply P by e in group G2 */ -func G2mul(P *ECP8, e *BIG) *ECP8 { +func G2mul(P *ECP8, e *BIG, mem *arena.Arena) *ECP8 { var R *ECP8 - q := NewBIGints(CURVE_Order) - ee := NewBIGcopy(e) - ee.Mod(q) - if USE_GS_G2 { - var Q []*ECP8 + q := NewBIGints(CURVE_Order, mem) + ee := NewBIGcopy(e, mem) + ee.Mod(q, mem) + var Q []*ECP8 - F := ECP8_frob_constants() - u := gs(ee) + F := ECP8_frob_constants() + u := gs(ee, mem) - t := NewBIGint(0) + t := NewBIGint(0, mem) - Q = append(Q, NewECP8()) - Q[0].Copy(P) - for i := 1; i < 16; i++ { - Q = append(Q, NewECP8()) - Q[i].Copy(Q[i-1]) - Q[i].frob(F, 1) - } - for i := 0; i < 16; i++ { - np := u[i].nbits() - t.copy(Modneg(u[i], q)) - nn := 
t.nbits() - if nn < np { - u[i].copy(t) - Q[i].Neg() - } - u[i].norm() - } - - R = Mul16(Q, u) - - } else { - R = P.Mul(e) + Q = append(Q, NewECP8(mem)) + Q[0].Copy(P) + for i := 1; i < 16; i++ { + Q = append(Q, NewECP8(mem)) + Q[i].Copy(Q[i-1]) + Q[i].frob(F, 1) } + for i := 0; i < 16; i++ { + np := u[i].nbits() + t.copy(Modneg(u[i], q, mem)) + nn := t.nbits() + if nn < np { + u[i].copy(t) + Q[i].Neg(mem) + } + u[i].norm() + } + + R = Mul16(Q, u, mem) return R } /* f=f^e */ /* Note that this method requires a lot of RAM! */ -func GTpow(d *FP48, e *BIG) *FP48 { - var r *FP48 - q := NewBIGints(CURVE_Order) - ee := NewBIGcopy(e) - ee.Mod(q) - if USE_GS_GT { - var g []*FP48 - f := NewFP2bigs(NewBIGints(Fra), NewBIGints(Frb)) - t := NewBIGint(0) +// func GTpow(d *FP48, e *BIG) *FP48 { +// var r *FP48 +// q := NewBIGints(CURVE_Order) +// ee := NewBIGcopy(e) +// ee.Mod(q) +// if USE_GS_GT { +// var g []*FP48 +// f := NewFP2bigs(NewBIGints(Fra), NewBIGints(Frb)) +// t := NewBIGint(0) - u := gs(ee) +// u := gs(ee) - g = append(g, NewFP48copy(d)) - for i := 1; i < 16; i++ { - g = append(g, NewFP48()) - g[i].Copy(g[i-1]) - g[i].frob(f, 1) - } - for i := 0; i < 16; i++ { - np := u[i].nbits() - t.copy(Modneg(u[i], q)) - nn := t.nbits() - if nn < np { - u[i].copy(t) - g[i].conj() - } - u[i].norm() - } - r = pow16(g, u) - } else { - r = d.Pow(ee) - } - return r -} +// g = append(g, NewFP48copy(d)) +// for i := 1; i < 16; i++ { +// g = append(g, NewFP48()) +// g[i].Copy(g[i-1]) +// g[i].frob(f, 1) +// } +// for i := 0; i < 16; i++ { +// np := u[i].nbits() +// t.copy(Modneg(u[i], q)) +// nn := t.nbits() +// if nn < np { +// u[i].copy(t) +// g[i].conj() +// } +// u[i].norm() +// } +// r = pow16(g, u) +// } else { +// r = d.Pow(ee) +// } +// return r +// } /* test G1 group membership */ -func G1member(P *ECP) bool { - if P.Is_infinity() { +func G1member(P *ECP, mem *arena.Arena) bool { + if P.Is_infinity(mem) { return false } - x := NewBIGints(CURVE_Bnx) - cru := 
NewFPbig(NewBIGints(CRu)) - W := NewECP() + x := NewBIGints(CURVE_Bnx, mem) + cru := NewFPbig(NewBIGints(CRu, mem), mem) + W := NewECP(mem) W.Copy(P) - W.getx().Mul(cru) - T := P.lmul(x) + W.getx().Mul(cru, mem) + T := P.lmul(x, mem, mem) if P.Equals(T) { return false } // P is of low order - T = T.Mul(x) - T = T.Mul(x) - T = T.Mul(x) - T = T.Mul(x) - T = T.Mul(x) - T = T.Mul(x) - T = T.Mul(x) - T.Neg() + T = T.Mul(x, mem, mem) + T = T.Mul(x, mem, mem) + T = T.Mul(x, mem, mem) + T = T.Mul(x, mem, mem) + T = T.Mul(x, mem, mem) + T = T.Mul(x, mem, mem) + T = T.Mul(x, mem, mem) + T.Neg(mem) if !W.Equals(T) { return false } @@ -889,19 +694,17 @@ func G1member(P *ECP) bool { } /* test G2 group membership */ -func G2member(P *ECP8) bool { - if P.Is_infinity() { +func G2member(P *ECP8, mem *arena.Arena) bool { + if P.Is_infinity(mem) { return false } F := ECP8_frob_constants() - x := NewBIGints(CURVE_Bnx) - W := NewECP8() + x := NewBIGints(CURVE_Bnx, mem) + W := NewECP8(mem) W.Copy(P) W.frob(F, 1) - T := P.Mul(x) - if SIGN_OF_X == NEGATIVEX { - T.Neg() - } + T := P.Mul(x, mem) + T.Neg(mem) /* R:=NewECP8(); R.Copy(W) R.frob(F,1) @@ -928,20 +731,20 @@ func GTcyclotomic(m *FP48) bool { if m.Isunity() { return false } - r := NewFP48copy(m) - r.conj() - r.Mul(m) + r := NewFP48copy(m, nil) + r.conj(nil) + r.Mul(m, nil) if !r.Isunity() { return false } - f := NewFP2bigs(NewBIGints(Fra), NewBIGints(Frb)) + f := NewFP2bigs(NewBIGints(Fra, nil), NewBIGints(Frb, nil), nil) r.Copy(m) - r.frob(f, 8) - w := NewFP48copy(r) - w.frob(f, 8) - w.Mul(m) + r.frob(f, 8, nil) + w := NewFP48copy(r, nil) + w.frob(f, 8, nil) + w.Mul(m, nil) if !w.Equals(r) { return false } @@ -953,16 +756,14 @@ func GTmember(m *FP48) bool { if !GTcyclotomic(m) { return false } - f := NewFP2bigs(NewBIGints(Fra), NewBIGints(Frb)) - x := NewBIGints(CURVE_Bnx) + f := NewFP2bigs(NewBIGints(Fra, nil), NewBIGints(Frb, nil), nil) + x := NewBIGints(CURVE_Bnx, nil) - r := NewFP48copy(m) - r.frob(f, 1) - t := m.Pow(x) + r := 
NewFP48copy(m, nil) + r.frob(f, 1, nil) + t := m.Pow(x, nil) - if SIGN_OF_X == NEGATIVEX { - t.conj() - } + t.conj(nil) if !r.Equals(t) { return false } diff --git a/nekryptology/pkg/core/curves/native/bls48581/rom.go b/nekryptology/pkg/core/curves/native/bls48581/rom_32.go similarity index 99% rename from nekryptology/pkg/core/curves/native/bls48581/rom.go rename to nekryptology/pkg/core/curves/native/bls48581/rom_32.go index 7e0400a..c6f7069 100644 --- a/nekryptology/pkg/core/curves/native/bls48581/rom.go +++ b/nekryptology/pkg/core/curves/native/bls48581/rom_32.go @@ -1,3 +1,5 @@ +//go:build js && wasm + /* * Copyright (c) 2012-2020 MIRACL UK Ltd. * diff --git a/nekryptology/pkg/core/curves/native/bls48581/rom_64.go b/nekryptology/pkg/core/curves/native/bls48581/rom_64.go new file mode 100644 index 0000000..e2adcbe --- /dev/null +++ b/nekryptology/pkg/core/curves/native/bls48581/rom_64.go @@ -0,0 +1,77 @@ +//go:build !js && !wasm + +/* + * Copyright (c) 2012-2020 MIRACL UK Ltd. + * + * This file is part of MIRACL Core + * (see https://github.com/miracl/core). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* Fixed Data in ROM - Field and Curve parameters */ + +package bls48581 + +// Base Bits= 60 +var Modulus = [...]Chunk{0xEDC154E6565912B, 0x8FDF721A4A48AC3, 0x7A5513170EE0A57, 0x394F4736DAF6836, 0xAF6E082ACD9CD30, 0xF3975444A48AE43, 0x22131BB3BE6C0F1, 0x12A0056E84F8D1, 0x76F313824E31D47, 0x1280F73FF34} +var ROI = [...]Chunk{0xEDC154E6565912A, 0x8FDF721A4A48AC3, 0x7A5513170EE0A57, 0x394F4736DAF6836, 0xAF6E082ACD9CD30, 0xF3975444A48AE43, 0x22131BB3BE6C0F1, 0x12A0056E84F8D1, 0x76F313824E31D47, 0x1280F73FF34} +var R2modp = [...]Chunk{0x79868479F1B5833, 0xFB6EBA8FCB82D07, 0x9CC8A7F1FD84C7F, 0x402C51CF5CC3CBB, 0x3F3114F078502C, 0xFC90829BDC8336E, 0xC7BE91DE9CA8EED, 0xD4D273BB17BFADB, 0x6EC7C9A81E792CA, 0x1DC317A6E4} +var SQRTm3 = [...]Chunk{0x51EDFC2A1D65A0A, 0xD62DAA292D8CDBF, 0x24112478269D616, 0x6C25D3CABF8AD71, 0xC8E9B16B5D3E4CD, 0xF50A03B738960EE, 0x1A664376FED4343, 0xBFFD8FB8925AE06, 0x600908C6A28DEAA, 0x1280F73F9A7} + +const MConst Chunk = 0x148B81FC39D5A7D + +var Fra = [...]Chunk{0x62EB6CFE42AEB25, 0xDB41942760AD3F9, 0xA7DF2570715ECE4, 0x90377B51208AC0F, 0x6848493E1C8C418, 0xF496307E298187E, 0x58740E3CAFD6B62, 0xF6067D047983E78, 0x49FA75CD7E73E55, 0xFD30DB501} +var Frb = [...]Chunk{0x62EB6CFE42AEB25, 0xDB41942760AD3F9, 0xA7DF2570715ECE4, 0x90377B51208AC0F, 0x6848493E1C8C418, 0xF496307E298187E, 0x58740E3CAFD6B62, 0xF6067D047983E78, 0x49FA75CD7E73E55, 0xFD30DB501} +var TWK = [...]Chunk{0x7B433D25F426953, 0xACE45923B9863D, 0xC28BBDFA2D37E16, 0x62FFCC8AFB4BC18, 0x661B4392F002C4F, 0x2ED27E951A14781, 0x670A6683B853246, 0xAEB8C9BA138A075, 0xC10075769CDDD9E, 0x3A65A537B} + +//*** rom curve parameters ***** +// Base Bits= 60 +// Ate Bits= 33 +// G2 Table size= 36 + +const CURVE_Cof_I int = 0 + +var CURVE_Cof = [...]Chunk{0x140000382, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0} + +const CURVE_B_I int = 1 + +var CURVE_B = [...]Chunk{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0} +var CURVE_Order = [...]Chunk{0x8A5FE6FCD671C01, 0xBE599467C24DA11, 
0xC7CD0562303C4CC, 0x9D34C4C92016A85, 0xBC972C2E6E74196, 0x3F0B3CBE003FAD6, 0x615C0D6C635387A, 0xE2885E233A9CCC1, 0x2386F8A925, 0x0} +var CURVE_Gx = [...]Chunk{0xBCE8732315AF640, 0x74DA5D3A1E6D8C3, 0x57DB368B11786CB, 0x665D859236EBDBC, 0x46A9DF6F9645847, 0xEDFFB9F75445505, 0xE86868CF61ABDBA, 0x93F860DE3F257E0, 0x40F2BAF2B73DF1E, 0x2AF59B7AC3} +var CURVE_Gy = [...]Chunk{0xDBB5DE3E2587A70, 0xF37AEF7B926B576, 0xF77C2876D1B2E35, 0x78584C3EF22F487, 0xFFB98AEE53E80F6, 0xD41B720EF7BB7BE, 0xFEB8A52E991279D, 0xB398A488A553C9E, 0x31F91F86B3A2D1F, 0xCEFDA44F65} +var CURVE_HTPC = [...]Chunk{0x393F0BE031193EC, 0xC28896440758243, 0xDBE4AA8E70D4620, 0x6B27BD55EFD560E, 0x24A9624BEECD070, 0xE2626AD7C53B361, 0xDD845A98030C755, 0x29389B4E6A62C2D, 0x5AF94F05D8A9FD4, 0x92348CD5DC} + +var CURVE_Bnx = [...]Chunk{0x140000381, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0} +var CRu = [...]Chunk{0x4DE9AC5E1C79B90, 0x5CD8E3F88E5DE82, 0xAB21F74F7421A20, 0x6694B9B60DB5D62, 0x73422B5FB82F431, 0xFF46A846B5FA6AA, 0x83D66C1E5FCBED6, 0x2096384F2AFA565, 0x8B75055DD5D1F4E, 0x2C6} +var CURVE_Pxaaa = [...]Chunk{0x34FD0B4ACE8BFAB, 0xB79766322154DEC, 0x4D80491F510317, 0x3CA0612F4005030, 0xBAAD1A8C42281A6, 0x3A2EF156C46FF79, 0x344DBCCB7DE64DB, 0x2775DEBABBEFC70, 0x71E4A38237FA45A, 0x5D615D9A78} +var CURVE_Pxaab = [...]Chunk{0x669B36676B47C57, 0x5556A01AFA143F1, 0x7630D979630FFD7, 0x6AFFA62504F0C3C, 0xABFEDF16214A7, 0x12307F4E1C3943A, 0xE1623E9526F6DA, 0xBC07E8B22BB6D98, 0x258512069B0E86A, 0x7C4973ECE2} +var CURVE_Pxaba = [...]Chunk{0x488156CA55A3E6A, 0xEF4CDED6B3F0B46, 0xCBDFBB879D5FEA8, 0x66F0D2A6D55F028, 0xC1DBD19242FFAE7, 0xCCBAB5AB6860161, 0xAE237CA7A6D6957, 0xAD83BC73A8A6CA9, 0xF1334E1B2EA1853, 0x1FCCC70198} +var CURVE_Pxabb = [...]Chunk{0x9A7033CBB7FEAFE, 0x10B8CB4E80BC3F0, 0x1C5257C200CA523, 0x43B1B279B9468C3, 0x5F63E1C776E6EC1, 0x393F8BE0CC218A9, 0x62F3E5821B7B92A, 0x54D4BFE8F5985AC, 0xEB6185C78D80129, 0xBE2218C25C} +var CURVE_Pxbaa = [...]Chunk{0x39C3A1C53F8CCE5, 0x5B5F746C9D4CBB7, 
0xD55FC1889AA80C6, 0xEF492AE589274FA, 0x9E48199D5AC10B2, 0xC5805386699981F, 0xB1642B5675FF0E7, 0xA9DD63007C675D0, 0x35913A3C598E4CA, 0x38B91C600B} +var CURVE_Pxbab = [...]Chunk{0x2004D914A3C093A, 0x7960910FCE3370F, 0xA9F177612F097FC, 0x40B9C0B15DD7595, 0x3835D28997EB57B, 0x7BB037418181DF6, 0xEF0977A3D1A5867, 0xCDA088F7B8F35DC, 0x738603F1311E4E, 0xC96C7797EB} +var CURVE_Pxbba = [...]Chunk{0x41607E60750E057, 0x4B5B0E205C3354E, 0xCBE4324C22D6333, 0xAA5EFCF3432AAD1, 0xF293B13CED0FD0C, 0xA2C0B7A449CEF11, 0x9D13852B6DB908B, 0x8AEE660DEA41B3, 0x61EE3F0197A4989, 0xB9B7951C60} +var CURVE_Pxbbb = [...]Chunk{0xE19DA00FBC6AE34, 0x6AF2FC9E97C3F84, 0x9BD6AEBF9FC44E5, 0x90B7E2B0D458547, 0xA93F29CFF364A71, 0x719728A7F9F8CFC, 0xFAF47B5211CF741, 0x4AAA2B1E5D7A9DE, 0x2BDEC5282624C4F, 0x827D5C22FB} +var CURVE_Pyaaa = [...]Chunk{0x3EDD3FE4D2D7971, 0x45012AB12C0FF32, 0x9ABF77EEA6D6590, 0x336D8AE5163C159, 0x35AFA27748D90F7, 0xBFC435FAAB09062, 0x59A577E6F3B39E, 0x2F3024B918B4238, 0x75B5DFA49721645, 0xEB53356C3} +var CURVE_Pyaab = [...]Chunk{0x1471DB936CD5665, 0x8B423525FFC7B11, 0x2FA097D760E2E58, 0xD1892AB24E1DD21, 0x6B243B1F192C5C3, 0x64732FCBF3AFB09, 0xA325E6FBA01D729, 0x5FCADC2B75A422B, 0xE0FF144DA653181, 0x284DC75979} +var CURVE_Pyaba = [...]Chunk{0x8332A526A2A8474, 0xBC7C46FC3B8FDE6, 0x1D35D51A652269C, 0x36CA3295E5E2F0C, 0xC99D0E904115155, 0xD370514475F7D5, 0x216D5B119D3A48, 0x67669EF2C2FC503, 0x8523E421EFB703, 0xB36A201DD0} +var CURVE_Pyabb = [...]Chunk{0x6213DA92841589D, 0xB3D8B8A1E533731, 0x7BDA503EE5E578F, 0x817742770BA10D6, 0x224333FA40DCED2, 0x10E122D2742C89B, 0x60DCEE23DD8B0E7, 0x78762B1C2CDED33, 0xEDC0688223FBBD4, 0xAEC25A4621} +var CURVE_Pybaa = [...]Chunk{0x47831F982E50137, 0x857FDDDFCF7A43F, 0x30135945D137B08, 0xCA4E512B64F59F4, 0x7FA238CDCE8A1E2, 0x5F1129857ED85C7, 0xB43DD93B5A95980, 0x88325A2554DC541, 0xA9C46916503FA5A, 0xD209D5A223} +var CURVE_Pybab = [...]Chunk{0x4EEDC58CF90BEE4, 0xA59ED8226CF3A59, 0xFC198CAA72B679D, 0xF47C180D139E3AA, 0xE8C270841F6824, 
0x55AB7504FA8342, 0xB16722B589D82E2, 0xD537B90421AD66E, 0x36B7A513D339D5A, 0x7D0D037457} +var CURVE_Pybba = [...]Chunk{0xD41FAEAFEB23986, 0xE884017D9AA62B3, 0x40FA639F53DCCC9, 0xAB8C74B2618B5BB, 0x5AE3A2864F22C1F, 0xE4C819A6DF98F42, 0xC0841B064155F14, 0xD17AF8A006F364F, 0xE65EA25C2D05DFD, 0x896767811B} +var CURVE_Pybbb = [...]Chunk{0x667FFCB732718B6, 0x5AC66E84069C55D, 0xD8C4AB33F748E, 0x333EC7192054173, 0x8E69C31E97E1AD0, 0xEF8ECA9A9533A3F, 0x6BE8E50C87549B6, 0x4F981B5E068F140, 0x9029D393A5C07E8, 0x35E2524FF8} + +//var CURVE_W=[2][10]Chunk {{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0}} +//var CURVE_SB=[2][2][10]Chunk {{{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0}},{{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0}}} +//var CURVE_WB=[4][10]Chunk {{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0}} +//var CURVE_BB=[4][4][10]Chunk {{{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0}},{{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0}},{{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0}},{{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0},{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0}}} diff --git a/nekryptology/pkg/vdf/vdf.go b/nekryptology/pkg/vdf/vdf.go index 286d563..a5536b4 100644 --- a/nekryptology/pkg/vdf/vdf.go +++ b/nekryptology/pkg/vdf/vdf.go @@ -18,7 +18,7 @@ type VDF struct { finished bool } -//size of long integers 
in quadratic function group +// size of long integers in quadratic function group const sizeInBits = 2048 // New create a new instance of VDF. @@ -53,12 +53,31 @@ func (vdf *VDF) Execute() { vdf.finished = true } +func (vdf *VDF) ExecuteIteration(x_blob []byte) { + vdf.finished = false + + yBuf, proofBuf := GenerateVDFIteration(vdf.input[:], x_blob, vdf.difficulty, sizeInBits) + + copy(vdf.output[:], yBuf) + copy(vdf.output[258:], proofBuf) + + go func() { + vdf.outputChan <- vdf.output + }() + + vdf.finished = true +} + // Verify runs the verification of generated proof // currently on i7-6700K, verification takes about 350 ms func (vdf *VDF) Verify(proof [516]byte) bool { return VerifyVDF(vdf.input[:], proof[:], vdf.difficulty, sizeInBits) } +func (vdf *VDF) VerifyIteration(x_blob [258]byte, proof [516]byte, iterations uint32) bool { + return VerifyVDFIteration(vdf.input[:], x_blob[:], proof[:], vdf.difficulty, sizeInBits) +} + // IsFinished returns whether the vdf execution is finished or not. func (vdf *VDF) IsFinished() bool { return vdf.finished diff --git a/nekryptology/pkg/vdf/wesolowski.go b/nekryptology/pkg/vdf/wesolowski.go index 139b233..2051cc3 100644 --- a/nekryptology/pkg/vdf/wesolowski.go +++ b/nekryptology/pkg/vdf/wesolowski.go @@ -16,8 +16,8 @@ import ( "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/iqc" ) -//Creates L and k parameters from papers, based on how many iterations need to be -//performed, and how much memory should be used. +// Creates L and k parameters from papers, based on how many iterations need to be +// performed, and how much memory should be used. 
func approximateParameters(T uint32) (int, int, int) { //log_memory = math.log(10000000, 2) log_memory := math.Log(10000000) / math.Log(2) @@ -86,6 +86,20 @@ func GenerateVDFWithStopChan(seed []byte, iterations, int_size_bits uint32, stop } } +func GenerateVDFIteration(seed, x_blob []byte, iterations, int_size_bits uint32) ([]byte, []byte) { + int_size := (int_size_bits + 16) >> 4 + D := iqc.CreateDiscriminant(seed, int_size_bits) + x, _ := iqc.NewClassGroupFromBytesDiscriminant(x_blob[:(2*int_size)], D) + + y, proof := calculateVDF(D, x, iterations, int_size_bits, nil) + + if (y == nil) || (proof == nil) { + return nil, nil + } else { + return y.Serialize(), proof.Serialize() + } +} + func VerifyVDF(seed, proof_blob []byte, iterations, int_size_bits uint32) bool { int_size := (int_size_bits + 16) >> 4 @@ -97,6 +111,16 @@ func VerifyVDF(seed, proof_blob []byte, iterations, int_size_bits uint32) bool { return verifyProof(x, y, proof, iterations) } +func VerifyVDFIteration(seed, x_blob, proof_blob []byte, iterations, int_size_bits uint32) bool { + int_size := (int_size_bits + 16) >> 4 + D := iqc.CreateDiscriminant(seed, int_size_bits) + x, _ := iqc.NewClassGroupFromBytesDiscriminant(x_blob[:(2*int_size)], D) + y, _ := iqc.NewClassGroupFromBytesDiscriminant(proof_blob[:(2*int_size)], D) + proof, _ := iqc.NewClassGroupFromBytesDiscriminant(proof_blob[2*int_size:], D) + + return verifyProof(x, y, proof, iterations) +} + // Creates a random prime based on input x, y, T // Note – this differs from harmony-one's implementation, as the Fiat-Shamir // transform requires _all_ public parameters be input, or else there is the @@ -133,7 +157,7 @@ func getBlock(i, k, T int, B *big.Int) *big.Int { return iqc.FloorDivision(new(big.Int).Mul(p1, p2), B) } -//Optimized evalutation of h ^ (2^T // B) +// Optimized evalutation of h ^ (2^T // B) func evalOptimized(identity, h *iqc.ClassGroup, B *big.Int, T uint32, k, l int, C map[int]*iqc.ClassGroup) *iqc.ClassGroup { //k1 = k//2 var k1 
int = k / 2 @@ -219,7 +243,7 @@ func evalOptimized(identity, h *iqc.ClassGroup, B *big.Int, T uint32, k, l int, return x } -//generate y = x ^ (2 ^T) and pi +// generate y = x ^ (2 ^T) and pi func generateProof(identity, x, y *iqc.ClassGroup, T uint32, k, l int, powers map[int]*iqc.ClassGroup) *iqc.ClassGroup { //x_s = x.serialize() x_s := x.Serialize() @@ -236,10 +260,12 @@ func generateProof(identity, x, y *iqc.ClassGroup, T uint32, k, l int, powers ma func calculateVDF(discriminant *big.Int, x *iqc.ClassGroup, iterations, int_size_bits uint32, stop <-chan struct{}) (y, proof *iqc.ClassGroup) { L, k, _ := approximateParameters(iterations) - loopCount := int(math.Ceil(float64(iterations) / float64(k*L))) + // NB: Dusk needs to do the disjoint set arithmetic, marking this spot down + // as the insertion point powers_to_calculate := make([]int, loopCount+2) + // link into next for i := 0; i < loopCount+1; i++ { powers_to_calculate[i] = i * k * L } diff --git a/nekryptology/pkg/zkp/schnorr/schnorr_test.go b/nekryptology/pkg/zkp/schnorr/schnorr_test.go index 47de66c..d897126 100644 --- a/nekryptology/pkg/zkp/schnorr/schnorr_test.go +++ b/nekryptology/pkg/zkp/schnorr/schnorr_test.go @@ -25,13 +25,13 @@ func TestZKPOverMultipleCurves(t *testing.T) { } for i, curve := range curveInstances { uniqueSessionId := sha3.New256().Sum([]byte("random seed")) - prover := NewProver(curve, nil, uniqueSessionId) + prover := NewProver(curve, nil, sha3.New256(), uniqueSessionId) secret := curve.Scalar.Random(rand.Reader) proof, err := prover.Prove(secret) require.NoError(t, err, fmt.Sprintf("failed in curve %d", i)) - err = Verify(proof, curve, nil, uniqueSessionId) + err = Verify(proof, curve, nil, sha3.New256(), uniqueSessionId) require.NoError(t, err, fmt.Sprintf("failed in curve %d", i)) } } diff --git a/node/.vscode/settings.json b/node/.vscode/settings.json new file mode 100644 index 0000000..aee3509 --- /dev/null +++ b/node/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + 
"go.testEnvVars": { + "GOEXPERIMENT": "arenas" + } +} \ No newline at end of file diff --git a/node/app/db_console.go b/node/app/db_console.go index ed93749..bc8b354 100644 --- a/node/app/db_console.go +++ b/node/app/db_console.go @@ -23,6 +23,7 @@ import ( "google.golang.org/grpc/credentials/insecure" "source.quilibrium.com/quilibrium/monorepo/node/config" "source.quilibrium.com/quilibrium/monorepo/node/execution/ceremony/application" + "source.quilibrium.com/quilibrium/monorepo/node/p2p" "source.quilibrium.com/quilibrium/monorepo/node/protobufs" "source.quilibrium.com/quilibrium/monorepo/node/tries" ) @@ -431,7 +432,7 @@ func (m model) View() string { list := []string{} for i, item := range m.filters { - str := item[0:12] + ".." + item[52:] + str := item[0:12] + ".." + item[len(item)-12:] if m.selectedFilter == item { list = append(list, selectedListStyle.Render(str)) } else if i == m.cursor { @@ -584,7 +585,7 @@ func (m model) View() string { for _, active := range app.ActiveParticipants { explorerContent += "\t" + base64.StdEncoding.EncodeToString( - active.KeyValue, + active.PublicKeySignatureEd448.PublicKey.KeyValue, ) + "\n" } @@ -624,7 +625,7 @@ func (m model) View() string { for _, active := range app.ActiveParticipants { explorerContent += "\t" + base64.StdEncoding.EncodeToString( - active.KeyValue, + active.PublicKeySignatureEd448.PublicKey.KeyValue, ) + "\n" } @@ -656,8 +657,10 @@ func (m model) View() string { ) + "\n" } case application.CEREMONY_APPLICATION_STATE_VALIDATING: + explorerContent += fmt.Sprintf( + "G1 Powers: %d\n", len(app.UpdatedTranscript.G1Powers), + ) explorerContent += "Preferred Next Round Participants: \n" - for _, next := range app.NextRoundPreferredParticipants { explorerContent += "\t" + base64.StdEncoding.EncodeToString( next.KeyValue, @@ -727,7 +730,10 @@ func consoleModel( 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }), - hex.EncodeToString(application.CEREMONY_ADDRESS), + 
hex.EncodeToString(append( + p2p.GetBloomFilter(application.CEREMONY_ADDRESS, 256, 3), + p2p.GetBloomFilterIndices(application.CEREMONY_ADDRESS, 65536, 24)..., + )), }, cursor: 0, conn: conn, diff --git a/node/app/wire.go b/node/app/wire.go index a3e09bb..588381f 100644 --- a/node/app/wire.go +++ b/node/app/wire.go @@ -8,7 +8,6 @@ import ( "go.uber.org/zap" "source.quilibrium.com/quilibrium/monorepo/node/config" "source.quilibrium.com/quilibrium/monorepo/node/consensus" - ceremonyConsensus "source.quilibrium.com/quilibrium/monorepo/node/consensus/ceremony" "source.quilibrium.com/quilibrium/monorepo/node/consensus/master" "source.quilibrium.com/quilibrium/monorepo/node/execution/ceremony" "source.quilibrium.com/quilibrium/monorepo/node/keys" @@ -38,6 +37,7 @@ var keyManagerSet = wire.NewSet( var storeSet = wire.NewSet( wire.FieldsOf(new(*config.Config), "DB"), store.NewPebbleDB, + wire.Bind(new(store.KVDB), new(*store.PebbleDB)), store.NewPebbleClockStore, store.NewPebbleKeyStore, store.NewPebbleDataProofStore, @@ -52,16 +52,8 @@ var pubSubSet = wire.NewSet( wire.Bind(new(p2p.PubSub), new(*p2p.BlossomSub)), ) -var dataConsensusSet = wire.NewSet( - wire.FieldsOf(new(*config.Config), "Engine"), - ceremonyConsensus.NewCeremonyDataClockConsensusEngine, - wire.Bind( - new(consensus.DataConsensusEngine), - new(*ceremonyConsensus.CeremonyDataClockConsensusEngine), - ), -) - var engineSet = wire.NewSet( + wire.FieldsOf(new(*config.Config), "Engine"), ceremony.NewCeremonyExecutionEngine, ) @@ -80,7 +72,6 @@ func NewNode(*config.Config) (*Node, error) { storeSet, pubSubSet, engineSet, - dataConsensusSet, consensusSet, newNode, )) diff --git a/node/app/wire_gen.go b/node/app/wire_gen.go index d391163..40fb5dc 100644 --- a/node/app/wire_gen.go +++ b/node/app/wire_gen.go @@ -11,9 +11,8 @@ import ( "go.uber.org/zap" "source.quilibrium.com/quilibrium/monorepo/node/config" "source.quilibrium.com/quilibrium/monorepo/node/consensus" - 
"source.quilibrium.com/quilibrium/monorepo/node/consensus/ceremony" "source.quilibrium.com/quilibrium/monorepo/node/consensus/master" - ceremony2 "source.quilibrium.com/quilibrium/monorepo/node/execution/ceremony" + "source.quilibrium.com/quilibrium/monorepo/node/execution/ceremony" "source.quilibrium.com/quilibrium/monorepo/node/keys" "source.quilibrium.com/quilibrium/monorepo/node/p2p" "source.quilibrium.com/quilibrium/monorepo/node/store" @@ -24,16 +23,15 @@ import ( func NewNode(configConfig *config.Config) (*Node, error) { zapLogger := logger() dbConfig := configConfig.DB - db := store.NewPebbleDB(dbConfig) - pebbleClockStore := store.NewPebbleClockStore(db, zapLogger) + pebbleDB := store.NewPebbleDB(dbConfig) + pebbleClockStore := store.NewPebbleClockStore(pebbleDB, zapLogger) keyConfig := configConfig.Key fileKeyManager := keys.NewFileKeyManager(keyConfig, zapLogger) p2PConfig := configConfig.P2P blossomSub := p2p.NewBlossomSub(p2PConfig, zapLogger) engineConfig := configConfig.Engine - pebbleKeyStore := store.NewPebbleKeyStore(db, zapLogger) - ceremonyDataClockConsensusEngine := ceremony.NewCeremonyDataClockConsensusEngine(engineConfig, zapLogger, fileKeyManager, pebbleClockStore, pebbleKeyStore, blossomSub) - ceremonyExecutionEngine := ceremony2.NewCeremonyExecutionEngine(zapLogger, ceremonyDataClockConsensusEngine, engineConfig, fileKeyManager, blossomSub, pebbleClockStore, pebbleKeyStore) + pebbleKeyStore := store.NewPebbleKeyStore(pebbleDB, zapLogger) + ceremonyExecutionEngine := ceremony.NewCeremonyExecutionEngine(zapLogger, engineConfig, fileKeyManager, blossomSub, pebbleClockStore, pebbleKeyStore) masterClockConsensusEngine := master.NewMasterClockConsensusEngine(engineConfig, zapLogger, pebbleClockStore, fileKeyManager, blossomSub) node, err := newNode(zapLogger, pebbleClockStore, fileKeyManager, blossomSub, ceremonyExecutionEngine, masterClockConsensusEngine) if err != nil { @@ -52,9 +50,9 @@ func NewDBConsole(configConfig *config.Config) 
(*DBConsole, error) { func NewClockStore(configConfig *config.Config) (store.ClockStore, error) { dbConfig := configConfig.DB - db := store.NewPebbleDB(dbConfig) + pebbleDB := store.NewPebbleDB(dbConfig) zapLogger := logger() - pebbleClockStore := store.NewPebbleClockStore(db, zapLogger) + pebbleClockStore := store.NewPebbleClockStore(pebbleDB, zapLogger) return pebbleClockStore, nil } @@ -75,17 +73,11 @@ var loggerSet = wire.NewSet( var keyManagerSet = wire.NewSet(wire.FieldsOf(new(*config.Config), "Key"), keys.NewFileKeyManager, wire.Bind(new(keys.KeyManager), new(*keys.FileKeyManager))) -var storeSet = wire.NewSet(wire.FieldsOf(new(*config.Config), "DB"), store.NewPebbleDB, store.NewPebbleClockStore, store.NewPebbleKeyStore, store.NewPebbleDataProofStore, wire.Bind(new(store.ClockStore), new(*store.PebbleClockStore)), wire.Bind(new(store.KeyStore), new(*store.PebbleKeyStore)), wire.Bind(new(store.DataProofStore), new(*store.PebbleDataProofStore))) +var storeSet = wire.NewSet(wire.FieldsOf(new(*config.Config), "DB"), store.NewPebbleDB, wire.Bind(new(store.KVDB), new(*store.PebbleDB)), store.NewPebbleClockStore, store.NewPebbleKeyStore, store.NewPebbleDataProofStore, wire.Bind(new(store.ClockStore), new(*store.PebbleClockStore)), wire.Bind(new(store.KeyStore), new(*store.PebbleKeyStore)), wire.Bind(new(store.DataProofStore), new(*store.PebbleDataProofStore))) var pubSubSet = wire.NewSet(wire.FieldsOf(new(*config.Config), "P2P"), p2p.NewBlossomSub, wire.Bind(new(p2p.PubSub), new(*p2p.BlossomSub))) -var dataConsensusSet = wire.NewSet(wire.FieldsOf(new(*config.Config), "Engine"), ceremony.NewCeremonyDataClockConsensusEngine, wire.Bind( - new(consensus.DataConsensusEngine), - new(*ceremony.CeremonyDataClockConsensusEngine), -), -) - -var engineSet = wire.NewSet(ceremony2.NewCeremonyExecutionEngine) +var engineSet = wire.NewSet(wire.FieldsOf(new(*config.Config), "Engine"), ceremony.NewCeremonyExecutionEngine) var consensusSet = 
wire.NewSet(master.NewMasterClockConsensusEngine, wire.Bind( new(consensus.ConsensusEngine), diff --git a/node/config/engine.go b/node/config/engine.go index 7fce2ac..b2f5f22 100644 --- a/node/config/engine.go +++ b/node/config/engine.go @@ -7,4 +7,8 @@ type EngineConfig struct { MaxFrames int64 `yaml:"maxFrames"` PendingCommitWorkers int64 `yaml:"pendingCommitWorkers"` MinimumPeersRequired int `yaml:"minimumPeersRequired"` + + // Values used only for testing – do not override these in production, your + // node will get kicked out + Difficulty uint32 `yaml:"difficulty"` } diff --git a/node/consensus/ceremony/broadcast_messaging.go b/node/consensus/ceremony/broadcast_messaging.go index b79c4a0..b9014db 100644 --- a/node/consensus/ceremony/broadcast_messaging.go +++ b/node/consensus/ceremony/broadcast_messaging.go @@ -2,8 +2,6 @@ package ceremony import ( "bytes" - "crypto" - "crypto/rand" "encoding/binary" "strings" "time" @@ -19,7 +17,6 @@ import ( "google.golang.org/protobuf/types/known/anypb" "source.quilibrium.com/quilibrium/monorepo/go-libp2p-blossomsub/pb" "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves" - "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/zkp/schnorr" "source.quilibrium.com/quilibrium/monorepo/node/consensus" qcrypto "source.quilibrium.com/quilibrium/monorepo/node/crypto" "source.quilibrium.com/quilibrium/monorepo/node/keys" @@ -111,22 +108,6 @@ func (e *CeremonyDataClockConsensusEngine) handleMessage( ); err != nil { return errors.Wrap(err, "handle message") } - case protobufs.ProvingKeyRequestType: - if err := e.handleProvingKeyRequest( - message.From, - msg.Address, - any, - ); err != nil { - return errors.Wrap(err, "handle message") - } - case protobufs.ProvingKeyAnnouncementType: - if err := e.handleProvingKey(message.From, msg.Address, any); err != nil { - return errors.Wrap(err, "handle message") - } - case protobufs.KeyBundleAnnouncementType: - if err := e.handleKeyBundle(message.From, msg.Address, 
any); err != nil { - return errors.Wrap(err, "handle message") - } case protobufs.CeremonyPeerListAnnounceType: if err := e.handleCeremonyPeerListAnnounce( message.From, @@ -304,177 +285,6 @@ func (e *CeremonyDataClockConsensusEngine) handleCeremonyLobbyStateTransition( return nil } -func (e *CeremonyDataClockConsensusEngine) handleKeyBundle( - peerID []byte, - address []byte, - any *anypb.Any, -) error { - e.logger.Debug("received key bundle") - keyBundleAnnouncement := &protobufs.KeyBundleAnnouncement{} - if err := any.UnmarshalTo(keyBundleAnnouncement); err != nil { - return errors.Wrap(err, "handle key bundle") - } - - if len(keyBundleAnnouncement.ProvingKeyBytes) == 0 { - return errors.Wrap(errors.New("proving key is nil"), "handle key bundle") - } - - k, err := e.keyStore.GetLatestKeyBundle(keyBundleAnnouncement.ProvingKeyBytes) - if err != nil && !errors.Is(err, store.ErrNotFound) { - return errors.Wrap(err, "handle key bundle") - } - - if k != nil { - latestAnnouncement := &protobufs.KeyBundleAnnouncement{} - err := proto.Unmarshal(k.Data, latestAnnouncement) - if err != nil { - return errors.Wrap(err, "handle key bundle") - } - - if bytes.Equal( - latestAnnouncement.IdentityKey.Challenge, - keyBundleAnnouncement.IdentityKey.Challenge, - ) && bytes.Equal( - latestAnnouncement.IdentityKey.Response, - keyBundleAnnouncement.IdentityKey.Response, - ) && bytes.Equal( - latestAnnouncement.IdentityKey.Statement, - keyBundleAnnouncement.IdentityKey.Statement, - ) && bytes.Equal( - latestAnnouncement.SignedPreKey.Challenge, - keyBundleAnnouncement.SignedPreKey.Challenge, - ) && bytes.Equal( - latestAnnouncement.SignedPreKey.Response, - keyBundleAnnouncement.SignedPreKey.Response, - ) && bytes.Equal( - latestAnnouncement.SignedPreKey.Statement, - keyBundleAnnouncement.SignedPreKey.Statement, - ) { - // This has already been proven, ignore - return nil - } - } - - var provingKey *protobufs.ProvingKeyAnnouncement - inclusion, err := e.keyStore.GetProvingKey( - 
keyBundleAnnouncement.ProvingKeyBytes, - ) - if err != nil { - if !errors.Is(err, store.ErrNotFound) { - return errors.Wrap(err, "handle key bundle") - } - - provingKey, err = e.keyStore.GetStagedProvingKey( - keyBundleAnnouncement.ProvingKeyBytes, - ) - if err != nil && !errors.Is(err, store.ErrNotFound) { - return errors.Wrap(err, "handle key bundle") - } - } else { - err := proto.Unmarshal(inclusion.Data, provingKey) - if err != nil { - return errors.Wrap(err, "handle key bundle") - } - } - - // We have a matching proving key, we can set this up to be committed. - if provingKey != nil { - e.logger.Debug("verifying key bundle announcement") - if err := keyBundleAnnouncement.Verify(provingKey); err != nil { - e.logger.Debug( - "could not verify key bundle announcement", - zap.Error(err), - ) - return nil - } - - go func() { - e.logger.Debug("adding key bundle announcement to pending commits") - - e.pendingCommits <- any - }() - - return nil - } else { - e.logger.Debug("proving key not found, requesting from peers") - - if err = e.publishMessage(e.filter, &protobufs.ProvingKeyRequest{ - ProvingKeyBytes: keyBundleAnnouncement.ProvingKeyBytes, - }); err != nil { - return errors.Wrap(err, "handle key bundle") - } - - e.dependencyMapMx.Lock() - e.dependencyMap[string(keyBundleAnnouncement.ProvingKeyBytes)] = any - e.dependencyMapMx.Unlock() - } - - return nil -} - -func (e *CeremonyDataClockConsensusEngine) handleProvingKey( - peerID []byte, - address []byte, - any *anypb.Any, -) error { - e.logger.Debug("received proving key") - - provingKeyAnnouncement := &protobufs.ProvingKeyAnnouncement{} - if err := any.UnmarshalTo(provingKeyAnnouncement); err != nil { - return errors.Wrap(err, "handle proving key") - } - - if err := provingKeyAnnouncement.Verify(); err != nil { - return errors.Wrap(err, "handle proving key") - } - - if err := e.keyStore.StageProvingKey(provingKeyAnnouncement); err != nil { - return errors.Wrap(err, "handle proving key") - } - - provingKey := 
provingKeyAnnouncement.PublicKey() - - e.logger.Debug( - "proving key staged", - zap.Binary("proving_key", provingKey), - ) - - go func() { - e.dependencyMapMx.Lock() - if e.dependencyMap[string(provingKey)] != nil { - keyBundleAnnouncement := &protobufs.KeyBundleAnnouncement{} - if err := proto.Unmarshal( - e.dependencyMap[string(provingKey)].Value, - keyBundleAnnouncement, - ); err != nil { - e.logger.Error( - "could not unmarshal key bundle announcement", - zap.Error(err), - ) - e.dependencyMapMx.Unlock() - return - } - if err := keyBundleAnnouncement.Verify( - provingKeyAnnouncement, - ); err != nil { - e.logger.Error( - "could not verify key bundle announcement", - zap.Error(err), - ) - e.dependencyMapMx.Unlock() - return - } - - e.pendingCommits <- e.dependencyMap[string(provingKey)] - - delete(e.dependencyMap, string(provingKey)) - } - e.dependencyMapMx.Unlock() - }() - - return nil -} - func (e *CeremonyDataClockConsensusEngine) handleClockFrameData( peerID []byte, address []byte, @@ -694,16 +504,30 @@ func (e *CeremonyDataClockConsensusEngine) handleClockFrameData( zap.Binary("filter", frame.Filter), zap.Uint64("frame_number", frame.FrameNumber), ) + masterFrame, err := e.clockStore.GetMasterClockFrame( + []byte{ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, + frame.FrameNumber-1, + ) + if err != nil { + e.logger.Info("received frame with no known master, needs sync") + return nil + } - parentSelector, selector, distance, err := - frame.GetParentSelectorAndDistance() + discriminator, err := masterFrame.GetSelector() + if err != nil { + return errors.Wrap(err, "handle clock frame data") + } + + parentSelector, distance, selector, err := + frame.GetParentSelectorAndDistance(discriminator) if err != nil { return errors.Wrap(err, "handle clock frame data") } - e.logger.Debug( - "difference between 
selector/discriminator", - zap.Binary("difference", distance.Bytes()), - ) if _, err := e.clockStore.GetParentDataClockFrame( frame.Filter, @@ -713,7 +537,7 @@ func (e *CeremonyDataClockConsensusEngine) handleClockFrameData( // If this is a frame number higher than what we're already caught up to, // push a request to fill the gap, unless we're syncing or it's in step, // then just lazily seek. - from := e.frame + from := e.frame.FrameNumber if from >= frame.FrameNumber-1 { from = frame.FrameNumber - 1 } @@ -737,9 +561,9 @@ func (e *CeremonyDataClockConsensusEngine) handleClockFrameData( } if err := e.clockStore.PutCandidateDataClockFrame( - parentSelector.Bytes(), - distance.Bytes(), - selector.Bytes(), + parentSelector.FillBytes(make([]byte, 32)), + distance.FillBytes(make([]byte, 32)), + selector.FillBytes(make([]byte, 32)), frame, txn, ); err != nil { @@ -752,7 +576,7 @@ func (e *CeremonyDataClockConsensusEngine) handleClockFrameData( return errors.Wrap(err, "handle clock frame data") } - if e.frame < frame.FrameNumber { + if e.frame.FrameNumber < frame.FrameNumber { e.latestFrameReceived = frame.FrameNumber e.lastFrameReceivedAt = time.Now().UTC() } @@ -819,12 +643,11 @@ func (e *CeremonyDataClockConsensusEngine) publishMessage( return e.pubSub.PublishToBitmask(filter, data) } -func (e *CeremonyDataClockConsensusEngine) announceKeyBundle() error { - e.logger.Debug("announcing key bundle") - idk, err := e.keyManager.GetAgreementKey("q-ratchet-idk") +func (e *CeremonyDataClockConsensusEngine) createCommunicationKeys() error { + _, err := e.keyManager.GetAgreementKey("q-ratchet-idk") if err != nil { if errors.Is(err, keys.KeyNotFoundErr) { - idk, err = e.keyManager.CreateAgreementKey( + _, err = e.keyManager.CreateAgreementKey( "q-ratchet-idk", keys.KeyTypeX448, ) @@ -836,10 +659,10 @@ func (e *CeremonyDataClockConsensusEngine) announceKeyBundle() error { } } - spk, err := e.keyManager.GetAgreementKey("q-ratchet-spk") + _, err = 
e.keyManager.GetAgreementKey("q-ratchet-spk") if err != nil { if errors.Is(err, keys.KeyNotFoundErr) { - spk, err = e.keyManager.CreateAgreementKey( + _, err = e.keyManager.CreateAgreementKey( "q-ratchet-spk", keys.KeyTypeX448, ) @@ -851,110 +674,5 @@ func (e *CeremonyDataClockConsensusEngine) announceKeyBundle() error { } } - idkPoint := curves.ED448().NewGeneratorPoint().Mul(idk) - idkProver := schnorr.NewProver( - curves.ED448(), - curves.ED448().NewGeneratorPoint(), - sha3.New256(), - []byte{}, - ) - - spkPoint := curves.ED448().NewGeneratorPoint().Mul(spk) - spkProver := schnorr.NewProver( - curves.ED448(), - curves.ED448().NewGeneratorPoint(), - sha3.New256(), - []byte{}, - ) - - idkProof, idkCommitment, err := idkProver.ProveCommit(idk) - if err != nil { - return errors.Wrap(err, "announce key bundle") - } - - spkProof, spkCommitment, err := spkProver.ProveCommit(spk) - if err != nil { - return errors.Wrap(err, "announce key bundle") - } - - msg := append( - append([]byte{}, idkCommitment...), - spkCommitment..., - ) - - signature, err := e.provingKey.Sign(rand.Reader, msg, crypto.Hash(0)) - if err != nil { - return errors.Wrap(err, "announce key bundle") - } - - signatureProto := &protobufs.ProvingKeyAnnouncement_ProvingKeySignatureEd448{ - ProvingKeySignatureEd448: &protobufs.Ed448Signature{ - PublicKey: &protobufs.Ed448PublicKey{ - KeyValue: e.provingKeyBytes, - }, - Signature: signature, - }, - } - provingKeyAnnouncement := &protobufs.ProvingKeyAnnouncement{ - IdentityCommitment: idkCommitment, - PrekeyCommitment: spkCommitment, - ProvingKeySignature: signatureProto, - } - - if err := e.publishMessage(e.filter, provingKeyAnnouncement); err != nil { - return errors.Wrap(err, "announce key bundle") - } - - idkSignature, err := e.provingKey.Sign( - rand.Reader, - idkPoint.ToAffineCompressed(), - crypto.Hash(0), - ) - if err != nil { - return errors.Wrap(err, "announce key bundle") - } - - spkSignature, err := e.provingKey.Sign( - rand.Reader, - 
spkPoint.ToAffineCompressed(), - crypto.Hash(0), - ) - if err != nil { - return errors.Wrap(err, "announce key bundle") - } - - keyBundleAnnouncement := &protobufs.KeyBundleAnnouncement{ - ProvingKeyBytes: e.provingKeyBytes, - IdentityKey: &protobufs.IdentityKey{ - Challenge: idkProof.C.Bytes(), - Response: idkProof.S.Bytes(), - Statement: idkProof.Statement.ToAffineCompressed(), - IdentityKeySignature: &protobufs.IdentityKey_PublicKeySignatureEd448{ - PublicKeySignatureEd448: &protobufs.Ed448Signature{ - PublicKey: &protobufs.Ed448PublicKey{ - KeyValue: idkPoint.ToAffineCompressed(), - }, - Signature: idkSignature, - }, - }, - }, - SignedPreKey: &protobufs.SignedPreKey{ - Challenge: spkProof.C.Bytes(), - Response: spkProof.S.Bytes(), - Statement: spkProof.Statement.ToAffineCompressed(), - SignedPreKeySignature: &protobufs.SignedPreKey_PublicKeySignatureEd448{ - PublicKeySignatureEd448: &protobufs.Ed448Signature{ - PublicKey: &protobufs.Ed448PublicKey{ - KeyValue: spkPoint.ToAffineCompressed(), - }, - Signature: spkSignature, - }, - }, - }, - } - - return errors.Wrap( - e.publishMessage(e.filter, keyBundleAnnouncement), - "announce key bundle", - ) + return nil } diff --git a/node/consensus/ceremony/ceremony_data_clock_consensus_engine.go b/node/consensus/ceremony/ceremony_data_clock_consensus_engine.go index 5ce1f30..fa40990 100644 --- a/node/consensus/ceremony/ceremony_data_clock_consensus_engine.go +++ b/node/consensus/ceremony/ceremony_data_clock_consensus_engine.go @@ -53,8 +53,7 @@ type ChannelServer = protobufs.CeremonyService_GetPublicChannelServer type CeremonyDataClockConsensusEngine struct { protobufs.UnimplementedCeremonyServiceServer - frame uint64 - activeFrame *protobufs.ClockFrame + frame *protobufs.ClockFrame difficulty uint32 logger *zap.Logger state consensus.EngineState @@ -113,6 +112,8 @@ func NewCeremonyDataClockConsensusEngine( clockStore store.ClockStore, keyStore store.KeyStore, pubSub p2p.PubSub, + filter []byte, + seed []byte, ) 
*CeremonyDataClockConsensusEngine { if logger == nil { panic(errors.New("logger is nil")) @@ -143,9 +144,14 @@ func NewCeremonyDataClockConsensusEngine( minimumPeersRequired = 3 } + difficulty := engineConfig.Difficulty + if difficulty == 0 { + difficulty = 10000 + } + e := &CeremonyDataClockConsensusEngine{ - frame: 0, - difficulty: 10000, + frame: nil, + difficulty: difficulty, logger: logger, state: consensus.EngineStateStopped, clockStore: clockStore, @@ -182,6 +188,8 @@ func NewCeremonyDataClockConsensusEngine( engineConfig, ) + e.filter = filter + e.input = seed e.provingKey = signer e.provingKeyType = keyType e.provingKeyBytes = bytes @@ -190,16 +198,10 @@ func NewCeremonyDataClockConsensusEngine( return e } -func (e *CeremonyDataClockConsensusEngine) Start( - filter []byte, - seed []byte, -) <-chan error { +func (e *CeremonyDataClockConsensusEngine) Start() <-chan error { e.logger.Info("starting ceremony consensus engine") e.state = consensus.EngineStateStarting errChan := make(chan error) - - e.filter = filter - e.input = seed e.state = consensus.EngineStateLoading e.logger.Info("loading last seen state") @@ -214,16 +216,16 @@ func (e *CeremonyDataClockConsensusEngine) Start( if latestFrame != nil { e.setFrame(latestFrame) } else { - latestFrame = e.createGenesisFrame() + latestFrame = e.CreateGenesisFrame(nil) + } + + err = e.createCommunicationKeys() + if err != nil { + panic(err) } e.logger.Info("subscribing to pubsub messages") e.pubSub.Subscribe(e.filter, e.handleMessage, true) - e.pubSub.Subscribe( - append(append([]byte{}, e.filter...), e.pubSub.GetPeerID()...), - e.handleSync, - true, - ) go func() { server := grpc.NewServer( @@ -240,8 +242,6 @@ func (e *CeremonyDataClockConsensusEngine) Start( } }() - latestFrame = e.performSanityCheck(latestFrame) - e.state = consensus.EngineStateCollecting for i := int64(0); i < e.pendingCommitWorkers; i++ { @@ -257,7 +257,7 @@ func (e *CeremonyDataClockConsensusEngine) Start( } timestamp := 
time.Now().UnixMilli() - msg := binary.BigEndian.AppendUint64([]byte{}, e.frame) + msg := binary.BigEndian.AppendUint64([]byte{}, e.frame.FrameNumber) msg = append(msg, consensus.GetVersion()...) msg = binary.BigEndian.AppendUint64(msg, uint64(timestamp)) sig, err := e.pubSub.SignMessage(msg) @@ -269,7 +269,7 @@ func (e *CeremonyDataClockConsensusEngine) Start( e.peerMap[string(e.pubSub.GetPeerID())] = &peerInfo{ peerId: e.pubSub.GetPeerID(), multiaddr: "", - maxFrame: e.frame, + maxFrame: e.frame.FrameNumber, version: consensus.GetVersion(), signature: sig, publicKey: e.pubSub.GetPublicKey(), @@ -307,38 +307,8 @@ func (e *CeremonyDataClockConsensusEngine) Start( }() go func() { - latest := latestFrame - for { - time.Sleep(30 * time.Second) - peerCount := e.pubSub.GetNetworkPeersCount() - if peerCount >= e.minimumPeersRequired { - e.logger.Info("selecting leader") - if e.frame > latest.FrameNumber && e.frame-latest.FrameNumber > 16 && - e.syncingTarget == nil { - e.logger.Info("rewinding sync head due to large delta") - latest, _, err = e.clockStore.GetDataClockFrame( - e.filter, - 0, - ) - if err != nil { - panic(err) - } - } - latest, err = e.commitLongestPath(latest) - if err != nil { - e.logger.Error("could not collect longest path", zap.Error(err)) - latest, _, err = e.clockStore.GetDataClockFrame(e.filter, 0) - if err != nil { - panic(err) - } - } - - latest = e.performSanityCheck(latest) - } - } - }() - - go func() { + e.logger.Info("waiting for peer list mappings") + time.Sleep(30 * time.Second) for e.state < consensus.EngineStateStopping { peerCount := e.pubSub.GetNetworkPeersCount() if peerCount < e.minimumPeersRequired { @@ -350,22 +320,23 @@ func (e *CeremonyDataClockConsensusEngine) Start( } else { switch e.state { case consensus.EngineStateCollecting: + currentFrame := latestFrame if latestFrame, err = e.collect(latestFrame); err != nil { e.logger.Error("could not collect", zap.Error(err)) e.state = consensus.EngineStateCollecting - errChan <- err + 
latestFrame = currentFrame } case consensus.EngineStateProving: + currentFrame := latestFrame if latestFrame, err = e.prove(latestFrame); err != nil { e.logger.Error("could not prove", zap.Error(err)) e.state = consensus.EngineStateCollecting - errChan <- err + latestFrame = currentFrame } case consensus.EngineStatePublishing: if err = e.publishProof(latestFrame); err != nil { e.logger.Error("could not publish", zap.Error(err)) e.state = consensus.EngineStateCollecting - errChan <- err } } } @@ -389,7 +360,7 @@ func (e *CeremonyDataClockConsensusEngine) Stop(force bool) <-chan error { for name := range e.executionEngines { name := name go func(name string) { - err := <-e.UnregisterExecutor(name, e.frame, force) + err := <-e.UnregisterExecutor(name, e.frame.FrameNumber, force) if err != nil { errChan <- err } @@ -463,7 +434,7 @@ func (e *CeremonyDataClockConsensusEngine) performSanityCheck( panic(err) } - parentSelector, _, _, err := disc.GetParentSelectorAndDistance() + parentSelector, _, _, err := disc.GetParentSelectorAndDistance(nil) if err != nil { panic(err) } @@ -536,7 +507,7 @@ func (e *CeremonyDataClockConsensusEngine) GetDifficulty() uint32 { return e.difficulty } -func (e *CeremonyDataClockConsensusEngine) GetFrame() uint64 { +func (e *CeremonyDataClockConsensusEngine) GetFrame() *protobufs.ClockFrame { return e.frame } @@ -550,12 +521,6 @@ func ( return e.frameChan } -func ( - e *CeremonyDataClockConsensusEngine, -) GetActiveFrame() *protobufs.ClockFrame { - return e.activeFrame -} - func ( e *CeremonyDataClockConsensusEngine, ) GetPeerInfo() *protobufs.PeerInfoResponse { diff --git a/node/consensus/ceremony/consensus_frames.go b/node/consensus/ceremony/consensus_frames.go index db8aabc..a1f29b3 100644 --- a/node/consensus/ceremony/consensus_frames.go +++ b/node/consensus/ceremony/consensus_frames.go @@ -3,16 +3,18 @@ package ceremony import ( "bytes" "context" + "encoding/base64" "encoding/binary" "encoding/hex" + "encoding/json" "fmt" "io" "math/big" + 
"os" "strings" "github.com/iden3/go-iden3-crypto/ff" "github.com/iden3/go-iden3-crypto/poseidon" - "github.com/libp2p/go-libp2p/core/peer" "github.com/pkg/errors" "go.uber.org/zap" "golang.org/x/crypto/sha3" @@ -25,7 +27,6 @@ import ( "source.quilibrium.com/quilibrium/monorepo/node/execution/ceremony/application" "source.quilibrium.com/quilibrium/monorepo/node/p2p" "source.quilibrium.com/quilibrium/monorepo/node/protobufs" - "source.quilibrium.com/quilibrium/monorepo/node/store" "source.quilibrium.com/quilibrium/monorepo/node/tries" ) @@ -322,9 +323,8 @@ func (e *CeremonyDataClockConsensusEngine) setFrame( } e.logger.Debug("set frame", zap.Uint64("frame_number", frame.FrameNumber)) e.currentDistance = distance - e.frame = frame.FrameNumber + e.frame = frame e.parentSelector = parent.Bytes() - e.activeFrame = frame go func() { e.frameChan <- frame }() @@ -332,7 +332,7 @@ func (e *CeremonyDataClockConsensusEngine) setFrame( func ( e *CeremonyDataClockConsensusEngine, -) createGenesisFrame() *protobufs.ClockFrame { +) CreateGenesisFrame(testProverKeys [][]byte) *protobufs.ClockFrame { e.logger.Info("creating genesis frame") for _, l := range strings.Split(string(e.input), "\n") { e.logger.Info(l) @@ -376,7 +376,7 @@ func ( transcript.RunningG2_256Powers = append( transcript.RunningG2_256Powers, &protobufs.BLS48581G2PublicKey{ - KeyValue: qcrypto.CeremonyPotPubKeys[len(qcrypto.CeremonyPotPubKeys)-1]. + KeyValue: qcrypto.CeremonyBLS48581G2[len(qcrypto.CeremonyBLS48581G2)-1]. 
ToAffineCompressed(), }, ) @@ -408,6 +408,44 @@ func ( rewardTrie.Add(addrBytes, 0, 50) } + // 2024-01-03: 1.2.0 + d, err := os.ReadFile("./retroactive_peers.json") + if err != nil { + panic(err) + } + + type peerData struct { + PeerId string `json:"peer_id"` + TokenBalance uint64 `json:"token_balance"` + } + type rewards struct { + Rewards []peerData `json:"rewards"` + } + + retroEntries := &rewards{} + err = json.Unmarshal(d, retroEntries) + if err != nil { + panic(err) + } + + e.logger.Info("adding retroactive peer reward info") + for _, s := range retroEntries.Rewards { + peerId := s.PeerId + peerBytes, err := base64.StdEncoding.DecodeString(peerId) + if err != nil { + panic(err) + } + + addr, err := poseidon.HashBytes(peerBytes) + if err != nil { + panic(err) + } + + addrBytes := addr.Bytes() + addrBytes = append(make([]byte, 32-len(addrBytes)), addrBytes...) + rewardTrie.Add(addrBytes, 0, s.TokenBalance) + } + trieBytes, err := rewardTrie.Serialize() if err != nil { panic(err) @@ -521,25 +559,42 @@ func ( // first phase: e.logger.Info("encoding signatories to prover trie") - for _, s := range qcrypto.CeremonySignatories { - pubkey := s.ToAffineCompressed() - e.logger.Info("0x" + hex.EncodeToString(pubkey)) + if len(testProverKeys) != 0 { + e.logger.Warn( + "TEST PROVER ENTRIES BEING ADDED, YOUR NODE WILL BE KICKED IF IN" + + " PRODUCTION", + ) + for _, s := range testProverKeys { + addr, err := poseidon.HashBytes(s) + if err != nil { + panic(err) + } - addr, err := poseidon.HashBytes(pubkey) - if err != nil { - panic(err) + addrBytes := addr.Bytes() + addrBytes = append(make([]byte, 32-len(addrBytes)), addrBytes...) + e.frameProverTrie.Add(addrBytes, 0) } + } else { + for _, s := range qcrypto.CeremonySignatories { + pubkey := s.ToAffineCompressed() + e.logger.Info("0x" + hex.EncodeToString(pubkey)) - addrBytes := addr.Bytes() - addrBytes = append(make([]byte, 32-len(addrBytes)), addrBytes...) 
- e.frameProverTrie.Add(addrBytes, 0) + addr, err := poseidon.HashBytes(pubkey) + if err != nil { + panic(err) + } + + addrBytes := addr.Bytes() + addrBytes = append(make([]byte, 32-len(addrBytes)), addrBytes...) + e.frameProverTrie.Add(addrBytes, 0) + } } e.logger.Info("proving genesis frame") input := []byte{} input = append(input, e.filter...) - input = binary.BigEndian.AppendUint64(input, e.frame) - input = binary.BigEndian.AppendUint64(input, uint64(0)) + input = binary.BigEndian.AppendUint64(input, 0) + input = binary.BigEndian.AppendUint64(input, 0) input = binary.BigEndian.AppendUint32(input, e.difficulty) input = append(input, e.input...) @@ -551,7 +606,7 @@ func ( frame := &protobufs.ClockFrame{ Filter: e.filter, - FrameNumber: e.frame, + FrameNumber: 0, Timestamp: 0, Difficulty: e.difficulty, Input: inputMessage, @@ -563,7 +618,7 @@ func ( PublicKeySignature: nil, } - parent, distance, selector, err := frame.GetParentSelectorAndDistance() + parent, _, selector, err := frame.GetParentSelectorAndDistance(nil) if err != nil { panic(err) } @@ -574,9 +629,9 @@ func ( } if err := e.clockStore.PutCandidateDataClockFrame( - parent.Bytes(), - distance.Bytes(), - selector.Bytes(), + parent.FillBytes(make([]byte, 32)), + big.NewInt(0).FillBytes(make([]byte, 32)), + selector.FillBytes(make([]byte, 32)), frame, txn, ); err != nil { @@ -643,13 +698,23 @@ func (e *CeremonyDataClockConsensusEngine) commitLongestPath( return nil, errors.Wrap(err, "commit longest path") } - selectorBytes := selector.Bytes() - selectorBytes = append( - make([]byte, 32-len(selectorBytes)), - selectorBytes..., - ) + masterFrame, err := e.clockStore.GetMasterClockFrame([]byte{ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, s[currentDepth].GetFrameNumber()) + if err != nil { + return nil, errors.Wrap(err, "commit longest path") + } + + 
proverSelector, err := masterFrame.GetSelector() + if err != nil { + return nil, errors.Wrap(err, "commit longest path") + } + nearest := e.frameProverTrie.FindNearest( - selectorBytes, + proverSelector.FillBytes(make([]byte, 32)), ) addr, err := value.GetAddress() @@ -786,37 +851,6 @@ func (e *CeremonyDataClockConsensusEngine) commitLongestPath( ) return nil, errors.Wrap(err, "commit longest path") } - case protobufs.KeyBundleAnnouncementType: - bundle := &protobufs.KeyBundleAnnouncement{} - if err := proto.Unmarshal(c.Data, bundle); err != nil { - e.logger.Error( - "could not commit candidate", - zap.Error(err), - zap.Uint64("frame_number", s.FrameNumber), - zap.Binary("commitment", c.Commitment), - ) - return nil, errors.Wrap(err, "commit longest path") - } - - e.logger.Debug( - "committing key bundle", - zap.Uint64("frame_number", s.FrameNumber), - zap.Binary("commitment", c.Commitment), - ) - - if err := e.keyStore.PutKeyBundle( - bundle.ProvingKeyBytes, - c, - txn, - ); err != nil { - e.logger.Error( - "could not commit candidate", - zap.Error(err), - zap.Uint64("frame_number", s.FrameNumber), - zap.Binary("output", s.Output), - ) - return nil, errors.Wrap(err, "commit longest path") - } } } } @@ -851,6 +885,22 @@ func (e *CeremonyDataClockConsensusEngine) commitLongestPath( } } + if current.FrameNumber != latest.FrameNumber { + to := current.FrameNumber + if to-16 > to { // underflow + to = 1 + } else { + to = to - 16 + } + + if 1 < to { + err := e.clockStore.DeleteCandidateDataClockFrameRange(e.filter, 1, to) + if err != nil { + e.logger.Error("error while purging candidate frames", zap.Error(err)) + } + } + } + return current, nil } @@ -860,7 +910,7 @@ func (e *CeremonyDataClockConsensusEngine) GetMostAheadPeer() ( error, ) { e.peerMapMx.Lock() - max := e.frame + max := e.frame.FrameNumber var peer []byte = nil for _, v := range e.peerMap { if v.maxFrame > max { @@ -882,190 +932,6 @@ func (e *CeremonyDataClockConsensusEngine) GetMostAheadPeer() ( return 
peer, max, nil } -func (e *CeremonyDataClockConsensusEngine) reverseOptimisticSync( - currentLatest *protobufs.ClockFrame, - maxFrame uint64, - peerId []byte, -) (*protobufs.ClockFrame, error) { - latest := currentLatest - cc, err := e.pubSub.GetDirectChannel(peerId) - if err != nil { - e.logger.Error( - "could not establish direct channel", - zap.Error(err), - ) - e.peerMapMx.Lock() - if _, ok := e.peerMap[string(peerId)]; ok { - e.uncooperativePeersMap[string(peerId)] = e.peerMap[string(peerId)] - delete(e.peerMap, string(peerId)) - } - e.peerMapMx.Unlock() - e.syncingTarget = nil - return latest, errors.Wrap(err, "reverse optimistic sync") - } - - client := protobufs.NewCeremonyServiceClient(cc) - - from := latest.FrameNumber - if from <= 1 { - from = 2 - } - - if maxFrame-from > 32 { - // divergence is high, ask them for the latest frame and if they - // respond with a valid answer, optimistically continue from this - // frame, if we hit a fault we'll mark them as uncooperative and move - // on - from = 2 - s, err := client.GetCompressedSyncFrames( - context.Background(), - &protobufs.ClockFramesRequest{ - Filter: e.filter, - FromFrameNumber: maxFrame - 32, - }, - grpc.MaxCallRecvMsgSize(600*1024*1024), - ) - if err != nil { - e.logger.Error( - "received error from peer", - zap.Error(err), - ) - e.peerMapMx.Lock() - if _, ok := e.peerMap[string(peerId)]; ok { - e.uncooperativePeersMap[string(peerId)] = e.peerMap[string(peerId)] - delete(e.peerMap, string(peerId)) - } - e.peerMapMx.Unlock() - e.syncingTarget = nil - return latest, errors.Wrap(err, "reverse optimistic sync") - } - var syncMsg *protobufs.CeremonyCompressedSync - for syncMsg, err = s.Recv(); err == nil; syncMsg, err = s.Recv() { - e.logger.Info( - "received compressed sync frame", - zap.Uint64("from", syncMsg.FromFrameNumber), - zap.Uint64("to", syncMsg.ToFrameNumber), - zap.Int("frames", len(syncMsg.TruncatedClockFrames)), - zap.Int("proofs", len(syncMsg.Proofs)), - ) - var next 
*protobufs.ClockFrame - if next, err = e.decompressAndStoreCandidates( - peerId, - syncMsg, - e.logger.Info, - ); err != nil && !errors.Is(err, ErrNoNewFrames) { - e.logger.Error( - "could not decompress and store candidate", - zap.Error(err), - ) - e.peerMapMx.Lock() - if _, ok := e.peerMap[string(peerId)]; ok { - e.uncooperativePeersMap[string(peerId)] = e.peerMap[string(peerId)] - delete(e.peerMap, string(peerId)) - } - e.peerMapMx.Unlock() - - if err := cc.Close(); err != nil { - e.logger.Error("error while closing connection", zap.Error(err)) - } - - e.syncingTarget = nil - e.syncingStatus = SyncStatusFailed - return currentLatest, errors.Wrap(err, "reverse optimistic sync") - } - if next != nil { - latest = next - } - } - if err != nil && err != io.EOF && !errors.Is(err, ErrNoNewFrames) { - if err := cc.Close(); err != nil { - e.logger.Error("error while closing connection", zap.Error(err)) - } - e.logger.Error("error while receiving sync", zap.Error(err)) - e.syncingTarget = nil - e.syncingStatus = SyncStatusFailed - return latest, errors.Wrap(err, "reverse optimistic sync") - } - } - - go func() { - defer func() { e.syncingTarget = nil }() - e.logger.Info("continuing sync in background") - s, err := client.GetCompressedSyncFrames( - context.Background(), - &protobufs.ClockFramesRequest{ - Filter: e.filter, - FromFrameNumber: from - 1, - ToFrameNumber: maxFrame, - }, - grpc.MaxCallRecvMsgSize(600*1024*1024), - ) - if err != nil { - e.logger.Error( - "error while retrieving sync", - zap.Error(err), - ) - e.peerMapMx.Lock() - if _, ok := e.peerMap[string(peerId)]; ok { - e.uncooperativePeersMap[string(peerId)] = e.peerMap[string(peerId)] - delete(e.peerMap, string(peerId)) - } - e.peerMapMx.Unlock() - e.syncingStatus = SyncStatusFailed - - if err := cc.Close(); err != nil { - e.logger.Error("error while closing connection", zap.Error(err)) - } - return - } else { - var syncMsg *protobufs.CeremonyCompressedSync - for syncMsg, err = s.Recv(); err == nil; 
syncMsg, err = s.Recv() { - e.logger.Debug( - "received compressed sync frame", - zap.Uint64("from", syncMsg.FromFrameNumber), - zap.Uint64("to", syncMsg.ToFrameNumber), - zap.Int("frames", len(syncMsg.TruncatedClockFrames)), - zap.Int("proofs", len(syncMsg.Proofs)), - ) - if _, err = e.decompressAndStoreCandidates( - peerId, - syncMsg, - e.logger.Debug, - ); err != nil && !errors.Is(err, ErrNoNewFrames) { - e.logger.Error( - "could not decompress and store candidate", - zap.Error(err), - ) - e.syncingTarget = nil - e.syncingStatus = SyncStatusFailed - if err := cc.Close(); err != nil { - e.logger.Error("error while closing connection", zap.Error(err)) - } - return - } - } - if err != nil && err != io.EOF && !errors.Is(err, ErrNoNewFrames) { - e.syncingTarget = nil - e.syncingStatus = SyncStatusFailed - e.logger.Error("error while receiving sync", zap.Error(err)) - if err := cc.Close(); err != nil { - e.logger.Error("error while closing connection", zap.Error(err)) - } - return - } - } - - if err := cc.Close(); err != nil { - e.logger.Error("error while closing connection", zap.Error(err)) - } - - e.syncingTarget = nil - e.syncingStatus = SyncStatusNotSyncing - }() - - return latest, nil -} - func (e *CeremonyDataClockConsensusEngine) sync( currentLatest *protobufs.ClockFrame, maxFrame uint64, @@ -1095,18 +961,48 @@ func (e *CeremonyDataClockConsensusEngine) sync( from = 1 } - if maxFrame > from { - s, err := client.GetCompressedSyncFrames( - context.Background(), - &protobufs.ClockFramesRequest{ - Filter: e.filter, - FromFrameNumber: maxFrame - 16, - }, - grpc.MaxCallRecvMsgSize(600*1024*1024), + if maxFrame > from && maxFrame > 3 { + from = maxFrame - 2 + } + + s, err := client.GetCompressedSyncFrames( + context.Background(), + &protobufs.ClockFramesRequest{ + Filter: e.filter, + FromFrameNumber: from, + }, + grpc.MaxCallRecvMsgSize(600*1024*1024), + ) + if err != nil { + e.logger.Error( + "received error from peer", + zap.Error(err), ) - if err != nil { + 
e.peerMapMx.Lock() + if _, ok := e.peerMap[string(peerId)]; ok { + e.uncooperativePeersMap[string(peerId)] = e.peerMap[string(peerId)] + delete(e.peerMap, string(peerId)) + } + e.peerMapMx.Unlock() + return latest, errors.Wrap(err, "reverse optimistic sync") + } + var syncMsg *protobufs.CeremonyCompressedSync + for syncMsg, err = s.Recv(); err == nil; syncMsg, err = s.Recv() { + e.logger.Info( + "received compressed sync frame", + zap.Uint64("from", syncMsg.FromFrameNumber), + zap.Uint64("to", syncMsg.ToFrameNumber), + zap.Int("frames", len(syncMsg.TruncatedClockFrames)), + zap.Int("proofs", len(syncMsg.Proofs)), + ) + var next *protobufs.ClockFrame + if next, err = e.decompressAndStoreCandidates( + peerId, + syncMsg, + e.logger.Info, + ); err != nil && !errors.Is(err, ErrNoNewFrames) { e.logger.Error( - "received error from peer", + "could not decompress and store candidate", zap.Error(err), ) e.peerMapMx.Lock() @@ -1115,56 +1011,31 @@ func (e *CeremonyDataClockConsensusEngine) sync( delete(e.peerMap, string(peerId)) } e.peerMapMx.Unlock() - return latest, errors.Wrap(err, "reverse optimistic sync") - } - var syncMsg *protobufs.CeremonyCompressedSync - for syncMsg, err = s.Recv(); err == nil; syncMsg, err = s.Recv() { - e.logger.Info( - "received compressed sync frame", - zap.Uint64("from", syncMsg.FromFrameNumber), - zap.Uint64("to", syncMsg.ToFrameNumber), - zap.Int("frames", len(syncMsg.TruncatedClockFrames)), - zap.Int("proofs", len(syncMsg.Proofs)), - ) - var next *protobufs.ClockFrame - if next, err = e.decompressAndStoreCandidates( - peerId, - syncMsg, - e.logger.Info, - ); err != nil && !errors.Is(err, ErrNoNewFrames) { - e.logger.Error( - "could not decompress and store candidate", - zap.Error(err), - ) - e.peerMapMx.Lock() - if _, ok := e.peerMap[string(peerId)]; ok { - e.uncooperativePeersMap[string(peerId)] = e.peerMap[string(peerId)] - delete(e.peerMap, string(peerId)) - } - e.peerMapMx.Unlock() - if err := cc.Close(); err != nil { - 
e.logger.Error("error while closing connection", zap.Error(err)) - } - - return currentLatest, errors.Wrap(err, "reverse optimistic sync") - } - if next != nil { - latest = next - } - } - if err != nil && err != io.EOF && !errors.Is(err, ErrNoNewFrames) { if err := cc.Close(); err != nil { e.logger.Error("error while closing connection", zap.Error(err)) } - e.logger.Error("error while receiving sync", zap.Error(err)) - return latest, errors.Wrap(err, "reverse optimistic sync") - } - e.logger.Info("received new leading frame", zap.Uint64("frame_number", latest.FrameNumber)) + return currentLatest, errors.Wrap(err, "reverse optimistic sync") + } + if next != nil { + latest = next + } + } + if err != nil && err != io.EOF && !errors.Is(err, ErrNoNewFrames) { if err := cc.Close(); err != nil { e.logger.Error("error while closing connection", zap.Error(err)) } + e.logger.Error("error while receiving sync", zap.Error(err)) + return latest, errors.Wrap(err, "reverse optimistic sync") + } + + e.logger.Info( + "received new leading frame", + zap.Uint64("frame_number", latest.FrameNumber), + ) + if err := cc.Close(); err != nil { + e.logger.Error("error while closing connection", zap.Error(err)) } return latest, nil @@ -1181,43 +1052,31 @@ func (e *CeremonyDataClockConsensusEngine) collect( latest = e.previousHead e.syncingStatus = SyncStatusNotSyncing } - maxFrame := uint64(0) - var peerId []byte peerId, maxFrame, err := e.GetMostAheadPeer() if err != nil { e.logger.Warn("no peers available, skipping sync") } else if peerId == nil { e.logger.Info("currently up to date, skipping sync") - } else if e.syncingTarget == nil { - e.syncingStatus = SyncStatusAwaitingResponse - e.logger.Info( - "setting syncing target", - zap.String("peer_id", peer.ID(peerId).String()), - ) - - e.syncingTarget = peerId - e.previousHead = latest - latest, err = e.reverseOptimisticSync(latest, maxFrame, peerId) - } else if maxFrame > latest.FrameNumber { + } else if maxFrame-2 > latest.FrameNumber { 
latest, err = e.sync(latest, maxFrame, peerId) } - go func() { - _, err = e.keyStore.GetProvingKey(e.provingKeyBytes) - if errors.Is(err, store.ErrNotFound) && - latest.FrameNumber-e.lastKeyBundleAnnouncementFrame > 6 { - if err = e.announceKeyBundle(); err != nil { - panic(err) - } - e.lastKeyBundleAnnouncementFrame = latest.FrameNumber - } - }() - e.logger.Info( "returning leader frame", zap.Uint64("frame_number", latest.FrameNumber), ) + e.logger.Info("selecting leader") + + latest, err = e.commitLongestPath(latest) + if err != nil { + e.logger.Error("could not collect longest path", zap.Error(err)) + latest, _, err = e.clockStore.GetDataClockFrame(e.filter, 0) + if err != nil { + panic(err) + } + } + e.setFrame(latest) e.state = consensus.EngineStateProving return latest, nil diff --git a/node/consensus/ceremony/execution_registration.go b/node/consensus/ceremony/execution_registration.go index 7e89a08..b781d1a 100644 --- a/node/consensus/ceremony/execution_registration.go +++ b/node/consensus/ceremony/execution_registration.go @@ -17,11 +17,11 @@ func (e *CeremonyDataClockConsensusEngine) RegisterExecutor( for { logger.Info( "awaiting frame", - zap.Uint64("current_frame", e.frame), + zap.Uint64("current_frame", e.frame.FrameNumber), zap.Uint64("target_frame", frame), ) - newFrame := e.frame + newFrame := e.frame.FrameNumber if newFrame >= frame { logger.Info( "injecting execution engine at frame", @@ -54,11 +54,11 @@ func (e *CeremonyDataClockConsensusEngine) UnregisterExecutor( for { logger.Info( "awaiting frame", - zap.Uint64("current_frame", e.frame), + zap.Uint64("current_frame", e.frame.FrameNumber), zap.Uint64("target_frame", frame), ) - newFrame := e.frame + newFrame := e.frame.FrameNumber if newFrame >= frame { logger.Info( "removing execution engine at frame", diff --git a/node/consensus/ceremony/peer_messaging.go b/node/consensus/ceremony/peer_messaging.go index 8f4c41b..07f1a5a 100644 --- a/node/consensus/ceremony/peer_messaging.go +++ 
b/node/consensus/ceremony/peer_messaging.go @@ -11,7 +11,6 @@ import ( "google.golang.org/grpc" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" - "source.quilibrium.com/quilibrium/monorepo/go-libp2p-blossomsub/pb" "source.quilibrium.com/quilibrium/monorepo/node/execution/ceremony/application" "source.quilibrium.com/quilibrium/monorepo/node/p2p" "source.quilibrium.com/quilibrium/monorepo/node/protobufs" @@ -20,52 +19,6 @@ import ( var ErrNoNewFrames = errors.New("peer reported no frames") -func (e *CeremonyDataClockConsensusEngine) handleSync( - message *pb.Message, -) error { - e.logger.Debug( - "received message", - zap.Binary("data", message.Data), - zap.Binary("from", message.From), - zap.Binary("signature", message.Signature), - ) - if bytes.Equal(message.From, e.pubSub.GetPeerID()) { - return nil - } - - msg := &protobufs.Message{} - - if err := proto.Unmarshal(message.Data, msg); err != nil { - return errors.Wrap(err, "handle sync") - } - - any := &anypb.Any{} - if err := proto.Unmarshal(msg.Payload, any); err != nil { - return errors.Wrap(err, "handle sync") - } - - switch any.TypeUrl { - case protobufs.ProvingKeyAnnouncementType: - if err := e.handleProvingKey( - message.From, - msg.Address, - any, - ); err != nil { - return errors.Wrap(err, "handle sync") - } - case protobufs.KeyBundleAnnouncementType: - if err := e.handleKeyBundle( - message.From, - msg.Address, - any, - ); err != nil { - return errors.Wrap(err, "handle sync") - } - } - - return nil -} - // GetCompressedSyncFrames implements protobufs.CeremonyServiceServer. 
func (e *CeremonyDataClockConsensusEngine) GetCompressedSyncFrames( request *protobufs.ClockFramesRequest, @@ -153,7 +106,7 @@ func (e *CeremonyDataClockConsensusEngine) GetCompressedSyncFrames( } } - max := e.frame + max := e.frame.FrameNumber to := request.ToFrameNumber // We need to slightly rewind, to compensate for unconfirmed frame heads on a @@ -469,93 +422,3 @@ func (e *CeremonyDataClockConsensusEngine) GetPublicChannel( ) error { return errors.New("not supported") } - -func (e *CeremonyDataClockConsensusEngine) handleProvingKeyRequest( - peerID []byte, - address []byte, - any *anypb.Any, -) error { - if bytes.Equal(peerID, e.pubSub.GetPeerID()) { - return nil - } - - request := &protobufs.ProvingKeyRequest{} - if err := any.UnmarshalTo(request); err != nil { - return nil - } - - if len(request.ProvingKeyBytes) == 0 { - e.logger.Debug( - "received proving key request for empty key", - zap.Binary("peer_id", peerID), - zap.Binary("address", address), - ) - return nil - } - - e.pubSub.Subscribe( - append(append([]byte{}, e.filter...), peerID...), - e.handleSync, - true, - ) - - e.logger.Debug( - "received proving key request", - zap.Binary("peer_id", peerID), - zap.Binary("address", address), - zap.Binary("proving_key", request.ProvingKeyBytes), - ) - - var provingKey *protobufs.ProvingKeyAnnouncement - inclusion, err := e.keyStore.GetProvingKey(request.ProvingKeyBytes) - if err != nil { - if !errors.Is(err, store.ErrNotFound) { - e.logger.Debug( - "peer asked for proving key that returned error", - zap.Binary("peer_id", peerID), - zap.Binary("address", address), - zap.Binary("proving_key", request.ProvingKeyBytes), - ) - return nil - } - - provingKey, err = e.keyStore.GetStagedProvingKey(request.ProvingKeyBytes) - if !errors.Is(err, store.ErrNotFound) { - e.logger.Debug( - "peer asked for proving key that returned error", - zap.Binary("peer_id", peerID), - zap.Binary("address", address), - zap.Binary("proving_key", request.ProvingKeyBytes), - ) - return nil - 
} else if err != nil { - e.logger.Debug( - "peer asked for unknown proving key", - zap.Binary("peer_id", peerID), - zap.Binary("address", address), - zap.Binary("proving_key", request.ProvingKeyBytes), - ) - return nil - } - } else { - err := proto.Unmarshal(inclusion.Data, provingKey) - if err != nil { - e.logger.Debug( - "inclusion commitment could not be deserialized", - zap.Binary("peer_id", peerID), - zap.Binary("address", address), - zap.Binary("proving_key", request.ProvingKeyBytes), - ) - return nil - } - } - - if err := e.publishMessage( - append(append([]byte{}, e.filter...), peerID...), - provingKey, - ); err != nil { - return nil - } - - return nil -} diff --git a/node/consensus/consensus_engine.go b/node/consensus/consensus_engine.go index d9f0fb4..d3b8eca 100644 --- a/node/consensus/consensus_engine.go +++ b/node/consensus/consensus_engine.go @@ -28,22 +28,21 @@ type ConsensusEngine interface { Stop(force bool) <-chan error RegisterExecutor(exec execution.ExecutionEngine, frame uint64) <-chan error UnregisterExecutor(name string, frame uint64, force bool) <-chan error - GetFrame() uint64 - GetDifficulty() uint32 - GetState() EngineState - GetFrameChannel() <-chan uint64 -} - -type DataConsensusEngine interface { - Start(filter []byte, seed []byte) <-chan error - Stop(force bool) <-chan error - RegisterExecutor(exec execution.ExecutionEngine, frame uint64) <-chan error - UnregisterExecutor(name string, frame uint64, force bool) <-chan error - GetFrame() uint64 + GetFrame() *protobufs.ClockFrame + GetDifficulty() uint32 + GetState() EngineState + GetFrameChannel() <-chan *protobufs.ClockFrame +} + +type DataConsensusEngine interface { + Start() <-chan error + Stop(force bool) <-chan error + RegisterExecutor(exec execution.ExecutionEngine, frame uint64) <-chan error + UnregisterExecutor(name string, frame uint64, force bool) <-chan error + GetFrame() *protobufs.ClockFrame GetDifficulty() uint32 GetState() EngineState GetFrameChannel() <-chan 
*protobufs.ClockFrame - GetActiveFrame() *protobufs.ClockFrame GetProvingKey( engineConfig *config.EngineConfig, ) (crypto.Signer, keys.KeyType, []byte, []byte) @@ -52,13 +51,13 @@ type DataConsensusEngine interface { } func GetMinimumVersionCutoff() time.Time { - return time.Date(2023, time.December, 2, 7, 0, 0, 0, time.UTC) + return time.Date(2024, time.January, 3, 7, 0, 0, 0, time.UTC) } func GetMinimumVersion() []byte { - return []byte{0x01, 0x01, 0x08} + return []byte{0x01, 0x02, 0x00} } func GetVersion() []byte { - return []byte{0x01, 0x01, 0x08} + return []byte{0x01, 0x02, 0x00} } diff --git a/node/consensus/master/broadcast_messaging.go b/node/consensus/master/broadcast_messaging.go index ba59f07..a956a3e 100644 --- a/node/consensus/master/broadcast_messaging.go +++ b/node/consensus/master/broadcast_messaging.go @@ -36,7 +36,6 @@ func (e *MasterClockConsensusEngine) handleMessage(message *pb.Message) error { eg := errgroup.Group{} eg.SetLimit(len(e.executionEngines)) - for name := range e.executionEngines { name := name eg.Go(func() error { @@ -52,7 +51,6 @@ func (e *MasterClockConsensusEngine) handleMessage(message *pb.Message) error { ) return errors.Wrap(err, "handle message") } - for _, m := range messages { m := m if err := e.publishMessage(m.Address, m); err != nil { @@ -64,11 +62,9 @@ func (e *MasterClockConsensusEngine) handleMessage(message *pb.Message) error { return errors.Wrap(err, "handle message") } } - return nil }) } - if err := eg.Wait(); err != nil { e.logger.Error("rejecting invalid message", zap.Error(err)) return errors.Wrap(err, "execution failed") @@ -96,7 +92,7 @@ func (e *MasterClockConsensusEngine) handleClockFrameData( return errors.Wrap(err, "handle clock frame data") } - if e.frame > frame.FrameNumber { + if e.frame.FrameNumber > frame.FrameNumber { e.logger.Debug( "received anachronistic frame", zap.Binary("sender", peerID), @@ -131,7 +127,7 @@ func (e *MasterClockConsensusEngine) handleClockFrameData( return errors.Wrap(err, 
"handle clock frame data") } - if e.frame < frame.FrameNumber { + if e.frame.FrameNumber < frame.FrameNumber { if err := e.enqueueSeenFrame(frame); err != nil { e.logger.Error("could not enqueue seen clock frame", zap.Error(err)) return errors.Wrap(err, "handle clock frame data") diff --git a/node/consensus/master/consensus_frames.go b/node/consensus/master/consensus_frames.go index 53693a2..04bd699 100644 --- a/node/consensus/master/consensus_frames.go +++ b/node/consensus/master/consensus_frames.go @@ -43,8 +43,7 @@ func (e *MasterClockConsensusEngine) setFrame(frame *protobufs.ClockFrame) { copy(previousSelectorBytes[:], frame.Output[:516]) e.logger.Debug("set frame", zap.Uint64("frame_number", frame.FrameNumber)) - e.frame = frame.FrameNumber - e.latestFrame = frame + e.frame = frame go func() { e.frameChan <- e.frame @@ -53,7 +52,7 @@ func (e *MasterClockConsensusEngine) setFrame(frame *protobufs.ClockFrame) { func ( e *MasterClockConsensusEngine, -) createGenesisFrame() *protobufs.ClockFrame { +) CreateGenesisFrame() *protobufs.ClockFrame { e.logger.Debug("creating genesis frame") b := sha3.Sum256(e.input) v := vdf.New(e.difficulty, b) @@ -65,7 +64,7 @@ func ( e.logger.Debug("proving genesis frame") input := []byte{} input = append(input, e.filter...) 
- input = binary.BigEndian.AppendUint64(input, e.frame) + input = binary.BigEndian.AppendUint64(input, 0) input = binary.BigEndian.AppendUint32(input, e.difficulty) if bytes.Equal(e.input, []byte{0x00}) { value := [516]byte{} @@ -82,7 +81,7 @@ func ( frame := &protobufs.ClockFrame{ Filter: e.filter, - FrameNumber: e.frame, + FrameNumber: 0, Timestamp: 0, Difficulty: e.difficulty, Input: inputMessage, @@ -107,13 +106,13 @@ func (e *MasterClockConsensusEngine) collect( if e.state == consensus.EngineStateCollecting { e.logger.Debug("collecting vdf proofs") - latest := e.latestFrame + latest := e.frame if e.syncingStatus == SyncStatusNotSyncing { peer, err := e.pubSub.GetRandomPeer(e.filter) if err != nil { if errors.Is(err, p2p.ErrNoPeersAvailable) { - e.logger.Warn("no peers available, skipping sync") + e.logger.Debug("no peers available, skipping sync") } else { e.logger.Error("error while fetching random peer", zap.Error(err)) } @@ -200,10 +199,10 @@ func ( }) if len(e.seenFrames) == 0 { - return e.latestFrame, nil + return e.frame, nil } - prev := e.latestFrame + prev := e.frame committedSet := []*protobufs.ClockFrame{} for len(e.seenFrames) > 0 { diff --git a/node/consensus/master/execution_registration.go b/node/consensus/master/execution_registration.go index 69657cb..e5dde0c 100644 --- a/node/consensus/master/execution_registration.go +++ b/node/consensus/master/execution_registration.go @@ -17,7 +17,7 @@ func (e *MasterClockConsensusEngine) RegisterExecutor( go func() { logger.Info( "starting execution engine at frame", - zap.Uint64("current_frame", e.frame), + zap.Uint64("current_frame", e.frame.FrameNumber), ) err := <-exec.Start() if err != nil { @@ -29,11 +29,11 @@ func (e *MasterClockConsensusEngine) RegisterExecutor( for { logger.Info( "awaiting frame", - zap.Uint64("current_frame", e.frame), + zap.Uint64("current_frame", e.frame.FrameNumber), zap.Uint64("target_frame", frame), ) - newFrame := e.frame + newFrame := e.frame.FrameNumber if newFrame >= 
frame { logger.Info( "injecting execution engine at frame", @@ -76,11 +76,11 @@ func (e *MasterClockConsensusEngine) UnregisterExecutor( for { logger.Info( "awaiting frame", - zap.Uint64("current_frame", e.frame), + zap.Uint64("current_frame", e.frame.FrameNumber), zap.Uint64("target_frame", frame), ) - newFrame := e.frame + newFrame := e.frame.FrameNumber if newFrame >= frame { logger.Info( "removing execution engine at frame", diff --git a/node/consensus/master/master_clock_consensus_engine.go b/node/consensus/master/master_clock_consensus_engine.go index 130149a..e8f6c22 100644 --- a/node/consensus/master/master_clock_consensus_engine.go +++ b/node/consensus/master/master_clock_consensus_engine.go @@ -25,16 +25,15 @@ const ( ) type MasterClockConsensusEngine struct { - frame uint64 + frame *protobufs.ClockFrame difficulty uint32 logger *zap.Logger state consensus.EngineState pubSub p2p.PubSub keyManager keys.KeyManager lastFrameReceivedAt time.Time - latestFrame *protobufs.ClockFrame - frameChan chan uint64 + frameChan chan *protobufs.ClockFrame executionEngines map[string]execution.ExecutionEngine filter []byte input []byte @@ -79,20 +78,29 @@ func NewMasterClockConsensusEngine( } e := &MasterClockConsensusEngine{ - frame: 0, + frame: nil, difficulty: 10000, logger: logger, state: consensus.EngineStateStopped, keyManager: keyManager, pubSub: pubSub, - frameChan: make(chan uint64), executionEngines: map[string]execution.ExecutionEngine{}, + frameChan: make(chan *protobufs.ClockFrame), input: seed, lastFrameReceivedAt: time.Time{}, syncingStatus: SyncStatusNotSyncing, clockStore: clockStore, } + latestFrame, err := e.clockStore.GetLatestMasterClockFrame(e.filter) + if err != nil && !errors.Is(err, store.ErrNotFound) { + panic(err) + } + + if latestFrame != nil { + e.frame = latestFrame + } + if e.filter, err = hex.DecodeString(engineConfig.Filter); err != nil { panic(errors.Wrap(err, "could not parse filter value")) } @@ -103,7 +111,7 @@ func 
NewMasterClockConsensusEngine( } func (e *MasterClockConsensusEngine) Start() <-chan error { - e.logger.Info("starting consensus engine") + e.logger.Info("starting master consensus engine") e.state = consensus.EngineStateStarting errChan := make(chan error) @@ -112,7 +120,7 @@ func (e *MasterClockConsensusEngine) Start() <-chan error { latestFrame, err := e.clockStore.GetLatestMasterClockFrame(e.filter) if err != nil && errors.Is(err, store.ErrNotFound) { - latestFrame = e.createGenesisFrame() + latestFrame = e.CreateGenesisFrame() txn, err := e.clockStore.NewTransaction() if err != nil { panic(err) @@ -131,11 +139,111 @@ func (e *MasterClockConsensusEngine) Start() <-chan error { e.setFrame(latestFrame) } + e.buildHistoricFrameCache(latestFrame) + + e.logger.Info("subscribing to pubsub messages") + e.pubSub.Subscribe(e.filter, e.handleMessage, true) + e.pubSub.Subscribe(e.pubSub.GetPeerID(), e.handleSync, true) + + e.state = consensus.EngineStateCollecting + + go func() { + for { + e.logger.Info( + "peers in store", + zap.Int("peer_store_count", e.pubSub.GetPeerstoreCount()), + zap.Int("network_peer_count", e.pubSub.GetNetworkPeersCount()), + ) + time.Sleep(10 * time.Second) + } + }() + + go func() { + for e.state < consensus.EngineStateStopping { + var err error + switch e.state { + case consensus.EngineStateCollecting: + currentFrame := latestFrame + if latestFrame, err = e.collect(latestFrame); err != nil { + e.logger.Error("could not collect", zap.Error(err)) + latestFrame = currentFrame + } + case consensus.EngineStateProving: + currentFrame := latestFrame + if latestFrame, err = e.prove(latestFrame); err != nil { + e.logger.Error("could not prove", zap.Error(err)) + latestFrame = currentFrame + } + case consensus.EngineStatePublishing: + if err = e.publishProof(latestFrame); err != nil { + e.logger.Error("could not publish", zap.Error(err)) + } + } + } + }() + + go func() { + errChan <- nil + }() + + return errChan +} + +func (e *MasterClockConsensusEngine) 
Stop(force bool) <-chan error { + e.logger.Info("stopping consensus engine") + e.state = consensus.EngineStateStopping + errChan := make(chan error) + + wg := sync.WaitGroup{} + wg.Add(len(e.executionEngines)) + for name := range e.executionEngines { + name := name + go func(name string) { + err := <-e.UnregisterExecutor(name, e.frame.FrameNumber, force) + if err != nil { + errChan <- err + } + wg.Done() + }(name) + } + + e.logger.Info("waiting for execution engines to stop") + wg.Wait() + e.logger.Info("execution engines stopped") + + e.state = consensus.EngineStateStopped + go func() { + errChan <- nil + }() + return errChan +} + +func (e *MasterClockConsensusEngine) GetDifficulty() uint32 { + return e.difficulty +} + +func (e *MasterClockConsensusEngine) GetFrame() *protobufs.ClockFrame { + return e.frame +} + +func (e *MasterClockConsensusEngine) GetState() consensus.EngineState { + return e.state +} + +func ( + e *MasterClockConsensusEngine, +) GetFrameChannel() <-chan *protobufs.ClockFrame { + return e.frameChan +} + +func (e *MasterClockConsensusEngine) buildHistoricFrameCache( + latestFrame *protobufs.ClockFrame, +) { e.historicFrames = []*protobufs.ClockFrame{} if latestFrame.FrameNumber != 0 { min := uint64(0) - if latestFrame.FrameNumber-255 > min { + if latestFrame.FrameNumber-255 > min && latestFrame.FrameNumber > 255 { min = latestFrame.FrameNumber - 255 } @@ -163,98 +271,4 @@ func (e *MasterClockConsensusEngine) Start() <-chan error { } e.historicFrames = append(e.historicFrames, latestFrame) - - e.logger.Info("subscribing to pubsub messages") - e.pubSub.Subscribe(e.filter, e.handleMessage, true) - e.pubSub.Subscribe(e.pubSub.GetPeerID(), e.handleSync, true) - - e.state = consensus.EngineStateCollecting - - go func() { - for { - e.logger.Info( - "peers in store", - zap.Int("peer_store_count", e.pubSub.GetPeerstoreCount()), - zap.Int("network_peer_count", e.pubSub.GetNetworkPeersCount()), - ) - time.Sleep(10 * time.Second) - } - }() - - go func() { - 
for e.state < consensus.EngineStateStopping { - var err error - switch e.state { - case consensus.EngineStateCollecting: - if latestFrame, err = e.collect(latestFrame); err != nil { - e.logger.Error("could not collect", zap.Error(err)) - errChan <- err - } - case consensus.EngineStateProving: - if latestFrame, err = e.prove(latestFrame); err != nil { - e.logger.Error("could not prove", zap.Error(err)) - errChan <- err - } - case consensus.EngineStatePublishing: - if err = e.publishProof(latestFrame); err != nil { - e.logger.Error("could not publish", zap.Error(err)) - errChan <- err - } - } - } - }() - - go func() { - errChan <- nil - }() - - return errChan -} - -func (e *MasterClockConsensusEngine) Stop(force bool) <-chan error { - e.logger.Info("stopping consensus engine") - e.state = consensus.EngineStateStopping - errChan := make(chan error) - - wg := sync.WaitGroup{} - wg.Add(len(e.executionEngines)) - for name := range e.executionEngines { - name := name - go func(name string) { - err := <-e.UnregisterExecutor(name, e.frame, force) - if err != nil { - errChan <- err - } - wg.Done() - }(name) - } - - e.logger.Info("waiting for execution engines to stop") - wg.Wait() - e.logger.Info("execution engines stopped") - - e.state = consensus.EngineStateStopped - - e.engineMx.Lock() - defer e.engineMx.Unlock() - go func() { - errChan <- nil - }() - return errChan -} - -func (e *MasterClockConsensusEngine) GetDifficulty() uint32 { - return e.difficulty -} - -func (e *MasterClockConsensusEngine) GetFrame() uint64 { - return e.frame -} - -func (e *MasterClockConsensusEngine) GetState() consensus.EngineState { - return e.state -} - -func (e *MasterClockConsensusEngine) GetFrameChannel() <-chan uint64 { - return e.frameChan } diff --git a/node/consensus/master/peer_messaging.go b/node/consensus/master/peer_messaging.go index 5afedbb..9d64c6b 100644 --- a/node/consensus/master/peer_messaging.go +++ b/node/consensus/master/peer_messaging.go @@ -5,7 +5,6 @@ import ( 
"github.com/pkg/errors" "go.uber.org/zap" - "golang.org/x/sync/errgroup" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" "source.quilibrium.com/quilibrium/monorepo/go-libp2p-blossomsub/pb" @@ -30,46 +29,6 @@ func (e *MasterClockConsensusEngine) handleSync(message *pb.Message) error { return errors.Wrap(err, "handle sync") } - eg := errgroup.Group{} - eg.SetLimit(len(e.executionEngines)) - - for name := range e.executionEngines { - name := name - eg.Go(func() error { - messages, err := e.executionEngines[name].ProcessMessage( - msg.Address, - msg, - ) - if err != nil { - e.logger.Error( - "could not process message for engine", - zap.Error(err), - zap.String("engine_name", name), - ) - return errors.Wrap(err, "handle message") - } - - for _, m := range messages { - m := m - if err := e.publishMessage(e.filter, m); err != nil { - e.logger.Error( - "could not publish message for engine", - zap.Error(err), - zap.String("engine_name", name), - ) - return errors.Wrap(err, "handle message") - } - } - - return nil - }) - } - - if err := eg.Wait(); err != nil { - e.logger.Error("rejecting invalid message", zap.Error(err)) - return errors.Wrap(err, "handle sync") - } - switch any.TypeUrl { case protobufs.ClockFramesResponseType: if err := e.handleClockFramesResponse( @@ -149,7 +108,7 @@ func (e *MasterClockConsensusEngine) handleClockFramesResponse( zap.Uint64("frame_number", frame.FrameNumber), ) - if e.frame < frame.FrameNumber { + if e.frame.FrameNumber < frame.FrameNumber { if err := e.enqueueSeenFrame(frame); err != nil { e.logger.Error("could not enqueue seen clock frame", zap.Error(err)) return errors.Wrap(err, "handle clock frame response") @@ -186,7 +145,7 @@ func (e *MasterClockConsensusEngine) handleClockFramesRequest( from := request.FromFrameNumber - if e.frame < from || len(e.historicFrames) == 0 { + if e.frame.FrameNumber < from || len(e.historicFrames) == 0 { e.logger.Debug( "peer asked for undiscovered frame", 
zap.Binary("peer_id", peerID), @@ -210,8 +169,8 @@ func (e *MasterClockConsensusEngine) handleClockFramesRequest( to = request.FromFrameNumber + 127 } - if int(to) > int(e.latestFrame.FrameNumber) { - to = e.latestFrame.FrameNumber + if int(to) > int(e.frame.FrameNumber) { + to = e.frame.FrameNumber } e.logger.Debug( diff --git a/node/crypto/kzg.go b/node/crypto/kzg.go index ffbf9a6..1c0488a 100644 --- a/node/crypto/kzg.go +++ b/node/crypto/kzg.go @@ -75,6 +75,221 @@ var CeremonyPotPubKeys []curves.PairingPoint var CeremonySignatories []curves.Point var FFTBLS48581 map[uint64][]curves.PairingPoint = make(map[uint64][]curves.PairingPoint) +func TestInit(file string) { + // start with phase 1 ceremony: + csBytes, err := os.ReadFile(file) + if err != nil { + panic(err) + } + + bls48581.Init() + + cs := &CeremonyState{} + if err := json.Unmarshal(csBytes, cs); err != nil { + panic(err) + } + + g1s := make([]curves.PairingPoint, 1024) + g2s := make([]curves.PairingPoint, 257) + g1ffts := make([]curves.PairingPoint, 1024) + wg := sync.WaitGroup{} + wg.Add(1024) + + for i := 0; i < 1024; i++ { + i := i + go func() { + b, err := hex.DecodeString(cs.PowersOfTau.G1Affines[i][2:]) + if err != nil { + panic(err) + } + g1, err := curves.BLS48581G1().NewGeneratorPoint().FromAffineCompressed(b) + if err != nil { + panic(err) + } + g1s[i] = g1.(curves.PairingPoint) + + f, err := hex.DecodeString(cs.PowersOfTau.G1FFT[i][2:]) + if err != nil { + panic(err) + } + g1fft, err := curves.BLS48581G1().NewGeneratorPoint().FromAffineCompressed(f) + if err != nil { + panic(err) + } + g1ffts[i] = g1fft.(curves.PairingPoint) + + if i < 257 { + b, err := hex.DecodeString(cs.PowersOfTau.G2Affines[i][2:]) + if err != nil { + panic(err) + } + g2, err := curves.BLS48581G2().NewGeneratorPoint().FromAffineCompressed( + b, + ) + if err != nil { + panic(err) + } + g2s[i] = g2.(curves.PairingPoint) + } + wg.Done() + }() + } + + wg.Wait() + + wg.Add(len(cs.Witness.RunningProducts)) + 
CeremonyRunningProducts = make([]curves.PairingPoint, len(cs.Witness.RunningProducts)) + for i, s := range cs.Witness.RunningProducts { + i, s := i, s + go func() { + b, err := hex.DecodeString(s[2:]) + if err != nil { + panic(err) + } + + g1, err := curves.BLS48581G1().NewGeneratorPoint().FromAffineCompressed(b) + if err != nil { + panic(err) + } + CeremonyRunningProducts[i] = g1.(curves.PairingPoint) + wg.Done() + }() + } + wg.Wait() + + wg.Add(len(cs.Witness.PotPubKeys)) + CeremonyPotPubKeys = make([]curves.PairingPoint, len(cs.Witness.PotPubKeys)) + for i, s := range cs.Witness.PotPubKeys { + i, s := i, s + go func() { + b, err := hex.DecodeString(s[2:]) + if err != nil { + panic(err) + } + + g2, err := curves.BLS48581G2().NewGeneratorPoint().FromAffineCompressed(b) + if err != nil { + panic(err) + } + CeremonyPotPubKeys[i] = g2.(curves.PairingPoint) + wg.Done() + }() + } + wg.Wait() + + wg.Add(len(cs.VoucherPubKeys)) + CeremonySignatories = make([]curves.Point, len(cs.VoucherPubKeys)) + for i, s := range cs.VoucherPubKeys { + i, s := i, s + go func() { + b, err := hex.DecodeString(s[2:]) + if err != nil { + panic(err) + } + + CeremonySignatories[i], err = curves.ED448().Point.FromAffineCompressed(b) + if err != nil { + panic(err) + } + wg.Done() + }() + } + wg.Wait() + + CeremonyBLS48581G1 = g1s + CeremonyBLS48581G2 = g2s + + // Post-ceremony, precompute everything and put it in the finalized ceremony + // state + modulus := make([]byte, 73) + bls48581.NewBIGints(bls48581.CURVE_Order, nil).ToBytes(modulus) + q := new(big.Int).SetBytes(modulus) + sizes := []int64{16, 128, 1024} + + wg.Add(len(sizes)) + root := make([]curves.PairingScalar, 3) + roots := make([][]curves.PairingScalar, 3) + reverseRoots := make([][]curves.PairingScalar, 3) + ffts := make([][]curves.PairingPoint, 3) + + for idx, i := range sizes { + i := i + idx := idx + go func() { + exp := new(big.Int).Quo( + new(big.Int).Sub(q, big.NewInt(1)), + big.NewInt(i), + ) + rootOfUnity := 
new(big.Int).Exp(big.NewInt(int64(37)), exp, q) + roots[idx] = make([]curves.PairingScalar, i+1) + reverseRoots[idx] = make([]curves.PairingScalar, i+1) + wg2 := sync.WaitGroup{} + wg2.Add(int(i)) + for j := int64(0); j < i; j++ { + j := j + go func() { + rev := big.NewInt(int64(j)) + r := new(big.Int).Exp( + rootOfUnity, + rev, + q, + ) + scalar, _ := (&curves.ScalarBls48581{}).SetBigInt(r) + + if rev.Cmp(big.NewInt(1)) == 0 { + root[idx] = scalar.(curves.PairingScalar) + } + + roots[idx][j] = scalar.(curves.PairingScalar) + reverseRoots[idx][i-j] = roots[idx][j] + wg2.Done() + }() + } + wg2.Wait() + roots[idx][i] = roots[idx][0] + reverseRoots[idx][0] = reverseRoots[idx][i] + wg.Done() + }() + } + wg.Wait() + + wg.Add(len(sizes)) + for i := range root { + i := i + RootOfUnityBLS48581[uint64(sizes[i])] = root[i] + RootsOfUnityBLS48581[uint64(sizes[i])] = roots[i] + ReverseRootsOfUnityBLS48581[uint64(sizes[i])] = reverseRoots[i] + + go func() { + // We precomputed 65536, others are cheap and will be fully precomputed + // post-ceremony + if sizes[i] < 65536 { + fftG1, err := FFTG1( + CeremonyBLS48581G1[:sizes[i]], + *curves.BLS48581( + curves.BLS48581G1().NewGeneratorPoint(), + ), + uint64(sizes[i]), + true, + ) + if err != nil { + panic(err) + } + + ffts[i] = fftG1 + } else { + ffts[i] = g1ffts + } + wg.Done() + }() + } + wg.Wait() + + for i := range root { + FFTBLS48581[uint64(sizes[i])] = ffts[i] + } +} + func Init() { // start with phase 1 ceremony: csBytes, err := os.ReadFile("./ceremony.json") @@ -202,7 +417,7 @@ func Init() { // Post-ceremony, precompute everything and put it in the finalized ceremony // state modulus := make([]byte, 73) - bls48581.NewBIGints(bls48581.CURVE_Order).ToBytes(modulus) + bls48581.NewBIGints(bls48581.CURVE_Order, nil).ToBytes(modulus) q := new(big.Int).SetBytes(modulus) sizes := []int64{16, 128, 1024, 65536} @@ -310,7 +525,7 @@ func NewKZGProver( func DefaultKZGProver() *KZGProver { modulus := make([]byte, 73) - 
bls48581.NewBIGints(bls48581.CURVE_Order).ToBytes(modulus) + bls48581.NewBIGints(bls48581.CURVE_Order, nil).ToBytes(modulus) q := new(big.Int).SetBytes(modulus) return NewKZGProver( curves.BLS48581(curves.BLS48581G1().Point), @@ -426,7 +641,7 @@ func (p *KZGProver) EvaluateLagrangeForm( xBI := x.BigInt() modulus := make([]byte, 73) - bls48581.NewBIGints(bls48581.CURVE_Order).ToBytes(modulus) + bls48581.NewBIGints(bls48581.CURVE_Order, nil).ToBytes(modulus) q := new(big.Int).SetBytes(modulus) xBI.Exp(xBI, width.BigInt(), q) xBI.Sub(xBI, big.NewInt(1)) diff --git a/node/crypto/kzg_test.go b/node/crypto/kzg_test.go index 87c4668..7f684af 100644 --- a/node/crypto/kzg_test.go +++ b/node/crypto/kzg_test.go @@ -81,7 +81,7 @@ func TestMain(m *testing.M) { // Post-ceremony, precompute everything and put it in the finalized ceremony // state modulus := make([]byte, 73) - bls48581.NewBIGints(bls48581.CURVE_Order).ToBytes(modulus) + bls48581.NewBIGints(bls48581.CURVE_Order, nil).ToBytes(modulus) q := new(big.Int).SetBytes(modulus) sizes := []int64{16} @@ -173,7 +173,7 @@ func TestMain(m *testing.M) { func TestKzgBytesToPoly(t *testing.T) { modulus := make([]byte, 73) - bls48581.NewBIGints(bls48581.CURVE_Order).ToBytes(modulus) + bls48581.NewBIGints(bls48581.CURVE_Order, nil).ToBytes(modulus) q := new(big.Int).SetBytes(modulus) p := crypto.NewKZGProver(curves.BLS48581(curves.BLS48581G1().Point), sha3.New256, q) @@ -215,7 +215,7 @@ func TestKzgBytesToPoly(t *testing.T) { func TestPolynomialCommitment(t *testing.T) { modulus := make([]byte, 73) - bls48581.NewBIGints(bls48581.CURVE_Order).ToBytes(modulus) + bls48581.NewBIGints(bls48581.CURVE_Order, nil).ToBytes(modulus) q := new(big.Int).SetBytes(modulus) p := crypto.NewKZGProver(curves.BLS48581(curves.BLS48581G1().Point), sha3.New256, q) @@ -263,7 +263,7 @@ func TestPolynomialCommitment(t *testing.T) { func TestKZGProof(t *testing.T) { modulus := make([]byte, 73) - bls48581.NewBIGints(bls48581.CURVE_Order).ToBytes(modulus) + 
bls48581.NewBIGints(bls48581.CURVE_Order, nil).ToBytes(modulus) q := new(big.Int).SetBytes(modulus) p := crypto.NewKZGProver(curves.BLS48581(curves.BLS48581G1().Point), sha3.New256, q) @@ -290,27 +290,51 @@ func TestKZGProof(t *testing.T) { curves.BLS48581G1().NewGeneratorPoint(), ), 16, - false, + true, ) require.NoError(t, err) - commit, err := p.Commit(evalPoly) + commit, err := p.Commit(poly) require.NoError(t, err) - z, err := (&curves.ScalarBls48581{}).SetBigInt(big.NewInt(2)) + z := crypto.RootsOfUnityBLS48581[16][2] require.NoError(t, err) - checky := poly[len(poly)-1] - for i := len(poly) - 2; i >= 0; i-- { - checky = checky.Mul(z).Add(poly[i]).(curves.PairingScalar) + checky := evalPoly[len(poly)-1] + for i := len(evalPoly) - 2; i >= 0; i-- { + checky = checky.Mul(z).Add(evalPoly[i]).(curves.PairingScalar) } - y, err := p.EvaluateLagrangeForm(evalPoly, z.(curves.PairingScalar), 16, 0) - require.NoError(t, err) - require.Equal(t, y.Cmp(checky), 0) + fmt.Printf("%+x\n", checky.Bytes()) - proof, err := p.Prove(evalPoly, commit, z.(curves.PairingScalar)) + divisors := make([]curves.PairingScalar, 2) + divisors[0] = (&curves.ScalarBls48581{}).Zero().Sub(z).(*curves.ScalarBls48581) + divisors[1] = (&curves.ScalarBls48581{}).One().(*curves.ScalarBls48581) + + a := make([]curves.PairingScalar, len(evalPoly)) + for i := 0; i < len(a); i++ { + a[i] = evalPoly[i].Clone().(*curves.ScalarBls48581) + } + + // Adapted from Feist's amortized proofs: + aPos := len(a) - 1 + bPos := len(divisors) - 1 + diff := aPos - bPos + out := make([]curves.PairingScalar, diff+1, diff+1) + for diff >= 0 { + out[diff] = a[aPos].Div(divisors[bPos]).(*curves.ScalarBls48581) + for i := bPos; i >= 0; i-- { + a[diff+i] = a[diff+i].Sub( + out[diff].Mul(divisors[i]), + ).(*curves.ScalarBls48581) + } + aPos -= 1 + diff -= 1 + } + + proof, err := p.PointLinearCombination(crypto.CeremonyBLS48581G1[:15], out) + // proof, err := p.Prove(evalPoly, commit, z.(curves.PairingScalar)) require.NoError(t, 
err) - require.True(t, p.Verify(commit, z.(curves.PairingScalar), y, proof)) + require.True(t, p.Verify(commit, z, checky, proof)) commitments, err := p.CommitAggregate( [][]curves.PairingScalar{evalPoly}, diff --git a/node/execution/ceremony/application/ceremony_application.go b/node/execution/ceremony/application/ceremony_application.go index 0819a39..85a09ea 100644 --- a/node/execution/ceremony/application/ceremony_application.go +++ b/node/execution/ceremony/application/ceremony_application.go @@ -14,8 +14,6 @@ var ErrInvalidStateTransition = errors.New("invalid state transition") type CeremonyApplicationState int -const V118_CUTOFF = uint64(45000) - var CEREMONY_ADDRESS = []byte{ // SHA3-256("q_kzg_ceremony") 0x34, 0x00, 0x1b, 0xe7, 0x43, 0x2c, 0x2e, 0x66, @@ -50,7 +48,7 @@ type CeremonyApplication struct { StateCount uint64 RoundCount uint64 LobbyState CeremonyApplicationState - ActiveParticipants []*protobufs.Ed448PublicKey + ActiveParticipants []*protobufs.CeremonyLobbyJoin NextRoundPreferredParticipants []*protobufs.Ed448PublicKey LatestSeenProverAttestations []*protobufs.CeremonySeenProverAttestation DroppedParticipantAttestations []*protobufs.CeremonyDroppedProverAttestation @@ -82,8 +80,22 @@ func (a *CeremonyApplication) Equals(b *CeremonyApplication) bool { for i := range a.ActiveParticipants { if !bytes.Equal( - a.ActiveParticipants[i].KeyValue, - b.ActiveParticipants[i].KeyValue, + a.ActiveParticipants[i].PublicKeySignatureEd448.PublicKey.KeyValue, + b.ActiveParticipants[i].PublicKeySignatureEd448.PublicKey.KeyValue, + ) { + return false + } + + if !bytes.Equal( + a.ActiveParticipants[i].IdentityKey.KeyValue, + b.ActiveParticipants[i].IdentityKey.KeyValue, + ) { + return false + } + + if !bytes.Equal( + a.ActiveParticipants[i].SignedPreKey.KeyValue, + b.ActiveParticipants[i].SignedPreKey.KeyValue, ) { return false } @@ -856,7 +868,7 @@ func (a *CeremonyApplication) ApplyTransition( } } - if currentFrameNumber > V118_CUTOFF && a.StateCount > 100 { + 
if a.StateCount > 10 { shouldReset = true } @@ -866,17 +878,19 @@ func (a *CeremonyApplication) ApplyTransition( a.RoundCount = 0 for _, p := range a.ActiveParticipants { p := p - if _, ok := droppedProversMap[string(p.KeyValue)]; !ok { + if _, ok := droppedProversMap[string( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + )]; !ok { a.NextRoundPreferredParticipants = append( append( []*protobufs.Ed448PublicKey{}, - p, + p.PublicKeySignatureEd448.PublicKey, ), a.NextRoundPreferredParticipants..., ) } } - a.ActiveParticipants = []*protobufs.Ed448PublicKey{} + a.ActiveParticipants = []*protobufs.CeremonyLobbyJoin{} a.DroppedParticipantAttestations = []*protobufs.CeremonyDroppedProverAttestation{} a.LatestSeenProverAttestations = @@ -958,7 +972,7 @@ func (a *CeremonyApplication) ApplyTransition( } a.LobbyState = CEREMONY_APPLICATION_STATE_VALIDATING - a.ActiveParticipants = []*protobufs.Ed448PublicKey{} + a.ActiveParticipants = []*protobufs.CeremonyLobbyJoin{} a.DroppedParticipantAttestations = []*protobufs.CeremonyDroppedProverAttestation{} a.LatestSeenProverAttestations = @@ -984,7 +998,7 @@ func (a *CeremonyApplication) ApplyTransition( } } - if currentFrameNumber > V118_CUTOFF && a.StateCount > 100 { + if a.StateCount > 10 { shouldReset = true } @@ -994,17 +1008,19 @@ func (a *CeremonyApplication) ApplyTransition( a.RoundCount = 0 for _, p := range a.ActiveParticipants { p := p - if _, ok := droppedProversMap[string(p.KeyValue)]; !ok { + if _, ok := droppedProversMap[string( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + )]; !ok { a.NextRoundPreferredParticipants = append( append( []*protobufs.Ed448PublicKey{}, - p, + p.PublicKeySignatureEd448.PublicKey, ), a.NextRoundPreferredParticipants..., ) } } - a.ActiveParticipants = []*protobufs.Ed448PublicKey{} + a.ActiveParticipants = []*protobufs.CeremonyLobbyJoin{} a.DroppedParticipantAttestations = []*protobufs.CeremonyDroppedProverAttestation{} a.LatestSeenProverAttestations = @@ -1036,7 +1052,25 @@ func (a 
*CeremonyApplication) ApplyTransition( } } - if a.UpdatedTranscript == nil { + shouldReset := false + if a.StateCount > 100 { + shouldReset = true + } + + if shouldReset { + a.LobbyState = CEREMONY_APPLICATION_STATE_OPEN + a.StateCount = 0 + a.RoundCount = 0 + a.ActiveParticipants = []*protobufs.CeremonyLobbyJoin{} + a.DroppedParticipantAttestations = + []*protobufs.CeremonyDroppedProverAttestation{} + a.LatestSeenProverAttestations = + []*protobufs.CeremonySeenProverAttestation{} + a.TranscriptRoundAdvanceCommits = + []*protobufs.CeremonyAdvanceRound{} + a.TranscriptShares = + []*protobufs.CeremonyTranscriptShare{} + } else if a.UpdatedTranscript == nil { rewardMultiplier := uint64(1) for i := 0; i < len(a.FinalCommits)-1; i++ { rewardMultiplier = rewardMultiplier << 1 @@ -1064,7 +1098,7 @@ func (a *CeremonyApplication) ApplyTransition( a.LobbyState = CEREMONY_APPLICATION_STATE_OPEN a.StateCount = 0 a.RoundCount = 0 - a.ActiveParticipants = []*protobufs.Ed448PublicKey{} + a.ActiveParticipants = []*protobufs.CeremonyLobbyJoin{} a.DroppedParticipantAttestations = []*protobufs.CeremonyDroppedProverAttestation{} a.LatestSeenProverAttestations = diff --git a/node/execution/ceremony/application/ceremony_application_in_progress.go b/node/execution/ceremony/application/ceremony_application_in_progress.go index abcc6d7..b61e5f3 100644 --- a/node/execution/ceremony/application/ceremony_application_in_progress.go +++ b/node/execution/ceremony/application/ceremony_application_in_progress.go @@ -22,7 +22,10 @@ func (a *CeremonyApplication) applySeenProverAttestation( inParticipantList := false for _, p := range a.ActiveParticipants { - if bytes.Equal(p.KeyValue, seenProverAttestation.SeenProverKey.KeyValue) { + if bytes.Equal( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + seenProverAttestation.SeenProverKey.KeyValue, + ) { inParticipantList = true break } @@ -93,7 +96,7 @@ func (a *CeremonyApplication) applyDroppedProverAttestation( inParticipantList := false for _, p := 
range a.ActiveParticipants { if bytes.Equal( - p.KeyValue, + p.PublicKeySignatureEd448.PublicKey.KeyValue, droppedProverAttestation.DroppedProverKey.KeyValue, ) { inParticipantList = true @@ -189,7 +192,7 @@ func (a *CeremonyApplication) applyTranscriptCommit( inParticipantList := false for _, p := range a.ActiveParticipants { if bytes.Equal( - p.KeyValue, + p.PublicKeySignatureEd448.PublicKey.KeyValue, transcriptCommit.ProverSignature.PublicKey.KeyValue, ) { inParticipantList = true diff --git a/node/execution/ceremony/application/ceremony_application_open.go b/node/execution/ceremony/application/ceremony_application_open.go index ab13d50..6f5b02d 100644 --- a/node/execution/ceremony/application/ceremony_application_open.go +++ b/node/execution/ceremony/application/ceremony_application_open.go @@ -89,11 +89,11 @@ func (a *CeremonyApplication) finalizeParticipantSet() error { power = power >> 1 } - a.ActiveParticipants = []*protobufs.Ed448PublicKey{} + a.ActiveParticipants = []*protobufs.CeremonyLobbyJoin{} for i := 0; i < int(power); i++ { a.ActiveParticipants = append( a.ActiveParticipants, - a.LobbyJoins[i].PublicKeySignatureEd448.PublicKey, + a.LobbyJoins[i], ) } diff --git a/node/execution/ceremony/application/ceremony_application_test.go b/node/execution/ceremony/application/ceremony_application_test.go index be3f033..cfcec43 100644 --- a/node/execution/ceremony/application/ceremony_application_test.go +++ b/node/execution/ceremony/application/ceremony_application_test.go @@ -122,7 +122,10 @@ func TestCeremonyTransitions(t *testing.T) { }) require.NoError(t, err) require.Equal(t, a.LobbyState, CEREMONY_APPLICATION_STATE_IN_PROGRESS) - require.True(t, bytes.Equal(a.ActiveParticipants[0].KeyValue, proverPubKey)) + require.True(t, bytes.Equal( + a.ActiveParticipants[0].PublicKeySignatureEd448.PublicKey.KeyValue, + proverPubKey, + )) tau := curves.BLS48581G1().Scalar.Random(rand.Reader) tau2 := tau.Mul(tau) diff --git 
a/node/execution/ceremony/application/ceremony_application_validating.go b/node/execution/ceremony/application/ceremony_application_validating.go index 4b50fac..d1d36e1 100644 --- a/node/execution/ceremony/application/ceremony_application_validating.go +++ b/node/execution/ceremony/application/ceremony_application_validating.go @@ -2,9 +2,9 @@ package application import ( "bytes" + "crypto/rand" "github.com/pkg/errors" - "golang.org/x/sync/errgroup" "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves" "source.quilibrium.com/quilibrium/monorepo/node/protobufs" ) @@ -37,59 +37,47 @@ func (a *CeremonyApplication) applyTranscript( ) } - g1s := make([]*curves.PointBls48581G1, len(a.UpdatedTranscript.G1Powers)) - eg := errgroup.Group{} - eg.SetLimit(100) + g1s := make([]curves.Point, len(a.UpdatedTranscript.G1Powers)) for i := range a.UpdatedTranscript.G1Powers { i := i - eg.Go(func() error { - if !bytes.Equal( - a.UpdatedTranscript.G1Powers[i].KeyValue, - transcript.G1Powers[i].KeyValue, - ) { - return errors.Wrap(errors.New("invalid g1s"), "apply transcript") - } + if !bytes.Equal( + a.UpdatedTranscript.G1Powers[i].KeyValue, + transcript.G1Powers[i].KeyValue, + ) { + return errors.Wrap(errors.New("invalid g1s"), "apply transcript") + } - g1 := &curves.PointBls48581G1{} - x, err := g1.FromAffineCompressed(a.UpdatedTranscript.G1Powers[i].KeyValue) - if err != nil { - return errors.Wrap(err, "apply transcript") - } - g1, _ = x.(*curves.PointBls48581G1) + g1 := &curves.PointBls48581G1{} + x, err := g1.FromAffineCompressed( + a.UpdatedTranscript.G1Powers[i].KeyValue, + ) + if err != nil { + return errors.Wrap(err, "apply transcript") + } - g1s[i] = g1 - - return nil - }) + g1s[i] = x } - g2s := make([]*curves.PointBls48581G2, len(a.UpdatedTranscript.G2Powers)) + g2s := make([]curves.Point, len(a.UpdatedTranscript.G2Powers)) for i := range a.UpdatedTranscript.G2Powers { i := i - eg.Go(func() error { - if !bytes.Equal( - 
a.UpdatedTranscript.G2Powers[i].KeyValue, - transcript.G2Powers[i].KeyValue, - ) { - return errors.Wrap(errors.New("invalid g2s"), "apply transcript") - } + if !bytes.Equal( + a.UpdatedTranscript.G2Powers[i].KeyValue, + transcript.G2Powers[i].KeyValue, + ) { + return errors.Wrap(errors.New("invalid g2s"), "apply transcript") + } - g2 := &curves.PointBls48581G2{} - x, err := g2.FromAffineCompressed(a.UpdatedTranscript.G2Powers[i].KeyValue) - if err != nil { - return errors.Wrap(err, "apply transcript") - } - g2, _ = x.(*curves.PointBls48581G2) + g2 := &curves.PointBls48581G2{} + x, err := g2.FromAffineCompressed( + a.UpdatedTranscript.G2Powers[i].KeyValue, + ) + if err != nil { + return errors.Wrap(err, "apply transcript") + } - g2s[i] = g2 - - return nil - }) - } - - if err := eg.Wait(); err != nil { - return err + g2s[i] = x } g1Witnesses := []*curves.PointBls48581G1{} @@ -168,52 +156,70 @@ func (a *CeremonyApplication) applyTranscript( } } - mp := []curves.PairingPoint{} mpg2 := curves.BLS48581G2().Point.Generator().(curves.PairingPoint) mpg2n := g2s[1].Neg().(curves.PairingPoint) - for i := 0; i < len(g1s)-1; i++ { - mp = append(mp, g1s[i]) - mp = append(mp, mpg2n) - mp = append(mp, g1s[i+1]) - mp = append(mp, mpg2) - } - - mp2 := []curves.PairingPoint{} mpg1 := curves.BLS48581G1().Point.Generator().(curves.PairingPoint) mpg1n := g1s[1].Neg().(curves.PairingPoint) - for i := 0; i < len(g2s)-1; i++ { - mp2 = append(mp2, mpg1n) - mp2 = append(mp2, g2s[i]) - mp2 = append(mp2, mpg1) - mp2 = append(mp2, g2s[i+1]) + + randoms := []curves.Scalar{} + sum := curves.BLS48581G1().Scalar.Zero() + + for i := 0; i < len(g1s)-1; i++ { + randoms = append(randoms, curves.BLS48581G1().Scalar.Random(rand.Reader)) + sum = sum.Add(randoms[i]) } - l := g1s[0].MultiPairing(mp...) 
- if !l.IsOne() { + g1CheckR := g1s[0].SumOfProducts(g1s[1:], randoms) + g1CheckL := g1s[0].SumOfProducts(g1s[:len(g1s)-1], randoms) + + if !mpg2.MultiPairing( + g1CheckL.(curves.PairingPoint), + mpg2n.Mul(sum).(curves.PairingPoint), + g1CheckR.(curves.PairingPoint), + mpg2.Mul(sum).(curves.PairingPoint), + ).IsOne() { return errors.Wrap( errors.New("pairing check failed for g1s"), "apply transcript", ) } - l = g1s[0].MultiPairing(mp2...) - if !l.IsOne() { + var g2CheckL, g2CheckR curves.Point + g2Sum := curves.BLS48581G1().Scalar.Zero() + for i := 0; i < len(g2s)-1; i++ { + g2Sum = g2Sum.Add(randoms[i]) + if g2CheckL == nil { + g2CheckL = g2s[0].Mul(randoms[0]) + g2CheckR = g2s[1].Mul(randoms[0]) + } else { + g2CheckL = g2CheckL.Add(g2s[i].Mul(randoms[i])) + g2CheckR = g2CheckR.Add(g2s[i+1].Mul(randoms[i])) + } + } + + if !mpg2.MultiPairing( + mpg1n.Mul(g2Sum).(curves.PairingPoint), + g2CheckL.(curves.PairingPoint), + mpg1.Mul(g2Sum).(curves.PairingPoint), + g2CheckR.(curves.PairingPoint), + ).IsOne() { return errors.Wrap( errors.New("pairing check failed for g2s"), "apply transcript", ) } - mp3 := []curves.PairingPoint{} + mp3 := make([]curves.PairingPoint, (len(g2Powers)-1)*4) for i := 0; i < len(g2Powers)-1; i++ { - mp3 = append(mp3, g1Witnesses[i+1].Neg().(curves.PairingPoint)) - mp3 = append(mp3, g2Powers[i]) - mp3 = append(mp3, mpg1) - mp3 = append(mp3, g2Powers[i+1]) + i := i + mp3[i*4+0] = g1Witnesses[i+1].Neg().(curves.PairingPoint) + mp3[i*4+1] = g2Powers[i] + mp3[i*4+2] = mpg1 + mp3[i*4+3] = g2Powers[i+1] } - l = g1s[0].MultiPairing(mp3...) + l := mp3[0].MultiPairing(mp3...) 
if !l.IsOne() { return errors.Wrap( errors.New("pairing check failed for witnesses"), diff --git a/node/execution/ceremony/application/ceremony_application_validating_test.go b/node/execution/ceremony/application/ceremony_application_validating_test.go index 7f6c196..3840e32 100644 --- a/node/execution/ceremony/application/ceremony_application_validating_test.go +++ b/node/execution/ceremony/application/ceremony_application_validating_test.go @@ -3,7 +3,9 @@ package application import ( "crypto" "crypto/rand" + "fmt" "testing" + "time" "github.com/cloudflare/circl/sign/ed448" "github.com/stretchr/testify/require" @@ -12,6 +14,166 @@ import ( "source.quilibrium.com/quilibrium/monorepo/node/protobufs" ) +// This does a full test of the 65536 powers, run this if you want to wait a +// long time +func TestApplyTranscript_Slow(t *testing.T) { + old := curves.BLS48581G1().Scalar.Random(rand.Reader) + olds := []*curves.ScalarBls48581{ + curves.BLS48581G1().Scalar.One().(*curves.ScalarBls48581), + } + tau := curves.BLS48581G1().Scalar.Random(rand.Reader) + taus := []*curves.ScalarBls48581{ + curves.BLS48581G1().Scalar.One().(*curves.ScalarBls48581), + } + fmt.Println(time.Now().Unix()) + fmt.Println("generate taus") + for i := 0; i < 65536; i++ { + olds = append(olds, olds[i].Mul(old).(*curves.ScalarBls48581)) + taus = append(taus, taus[i].Mul(tau).(*curves.ScalarBls48581)) + } + tauPubG2 := curves.BLS48581G2().Point.Generator().Mul(tau) + + fmt.Println(time.Now().Unix()) + fmt.Println("taus generated") + proverPubKey, proverKey, err := ed448.GenerateKey(rand.Reader) + require.NoError(t, err) + proverSig, err := proverKey.Sign( + rand.Reader, + tauPubG2.ToAffineCompressed(), + crypto.Hash(0), + ) + require.NoError(t, err) + + fmt.Println(time.Now().Unix()) + fmt.Println("prover signature generated") + blsSignature := make([]byte, int(bls48581.MODBYTES)+1) + key := tau.Bytes() + + for i, j := 0, len(key)-1; i < j; i, j = i+1, j-1 { + key[i], key[j] = key[j], key[i] + } + + 
if bls48581.Core_Sign(blsSignature, proverKey, key) != bls48581.BLS_OK { + require.Fail(t, "could not sign") + } + + fmt.Println(time.Now().Unix()) + fmt.Println("bls signature generated") + + blsSig := blsSignature[:] + oldTranscript := &protobufs.CeremonyTranscript{ + G1Powers: []*protobufs.BLS48581G1PublicKey{}, + G2Powers: []*protobufs.BLS48581G2PublicKey{}, + RunningG1_256Witnesses: []*protobufs.BLS48581G1PublicKey{ + { + KeyValue: curves.BLS48581G1().Point.Generator().ToAffineCompressed(), + }, + }, + RunningG2_256Powers: []*protobufs.BLS48581G2PublicKey{ + { + KeyValue: curves.BLS48581G2().Point.Generator().Mul( + olds[256], + ).ToAffineCompressed(), + }, + }, + } + updatedTranscript := &protobufs.CeremonyTranscript{ + G1Powers: []*protobufs.BLS48581G1PublicKey{}, + G2Powers: []*protobufs.BLS48581G2PublicKey{}, + RunningG1_256Witnesses: []*protobufs.BLS48581G1PublicKey{ + { + KeyValue: curves.BLS48581G1().Point.Generator().ToAffineCompressed(), + }, + { + KeyValue: curves.BLS48581G1().Point.Generator().Mul( + taus[256], + ).ToAffineCompressed(), + }, + }, + RunningG2_256Powers: []*protobufs.BLS48581G2PublicKey{ + { + KeyValue: curves.BLS48581G2().Point.Generator().Mul( + olds[256], + ).ToAffineCompressed(), + }, + { + KeyValue: curves.BLS48581G2().Point.Generator().Mul( + olds[256], + ).Mul(taus[256]).ToAffineCompressed(), + }, + }, + } + + for i, o := range olds { + oldTranscript.G1Powers = append( + oldTranscript.G1Powers, + &protobufs.BLS48581G1PublicKey{ + KeyValue: curves.BLS48581G1().Point.Generator().Mul( + o, + ).ToAffineCompressed(), + }, + ) + + updatedTranscript.G1Powers = append( + updatedTranscript.G1Powers, + &protobufs.BLS48581G1PublicKey{ + KeyValue: curves.BLS48581G1().Point.Generator().Mul( + o, + ).Mul(taus[i]).ToAffineCompressed(), + }, + ) + + if i < 257 { + oldTranscript.G2Powers = append( + oldTranscript.G2Powers, + &protobufs.BLS48581G2PublicKey{ + KeyValue: curves.BLS48581G2().Point.Generator().Mul( + o, + ).ToAffineCompressed(), + 
}, + ) + + updatedTranscript.G2Powers = append( + updatedTranscript.G2Powers, + &protobufs.BLS48581G2PublicKey{ + KeyValue: curves.BLS48581G2().Point.Generator().Mul( + o, + ).Mul(taus[i]).ToAffineCompressed(), + }, + ) + } + } + + fmt.Println(time.Now().Unix()) + fmt.Println("transcripts generated") + a := &CeremonyApplication{ + StateCount: 0, + RoundCount: 0, + LobbyState: CEREMONY_APPLICATION_STATE_VALIDATING, + FinalCommits: []*protobufs.CeremonyTranscriptCommit{ + { + ProverSignature: &protobufs.Ed448Signature{ + Signature: proverSig, + PublicKey: &protobufs.Ed448PublicKey{ + KeyValue: proverPubKey, + }, + }, + ContributionSignature: &protobufs.BLS48581Signature{ + Signature: blsSig, + PublicKey: &protobufs.BLS48581G2PublicKey{ + KeyValue: tauPubG2.ToAffineCompressed(), + }, + }, + }, + }, + LatestTranscript: oldTranscript, + UpdatedTranscript: updatedTranscript, + } + + err = a.applyTranscript(updatedTranscript) + require.NoError(t, err) +} + func TestApplyTranscript(t *testing.T) { old := curves.BLS48581G1().Scalar.Random(rand.Reader) old2 := old.Mul(old) @@ -322,5 +484,5 @@ func TestApplyRewritingTranscriptFails(t *testing.T) { } err = a.applyTranscript(updatedTranscript) - require.NoError(t, err) + require.Error(t, err) } diff --git a/node/execution/ceremony/ceremony_execution_engine.go b/node/execution/ceremony/ceremony_execution_engine.go index d122a09..8855ac8 100644 --- a/node/execution/ceremony/ceremony_execution_engine.go +++ b/node/execution/ceremony/ceremony_execution_engine.go @@ -37,6 +37,7 @@ type CeremonyExecutionEngine struct { keyManager keys.KeyManager engineConfig *config.EngineConfig pubSub p2p.PubSub + peerIdHash []byte provingKey crypto.Signer proverPublicKey []byte provingKeyAddress []byte @@ -48,11 +49,11 @@ type CeremonyExecutionEngine struct { alreadyPublishedTranscript bool seenMessageMap map[string]bool seenMessageMx sync.Mutex + intrinsicFilter []byte } func NewCeremonyExecutionEngine( logger *zap.Logger, - clock 
*ceremony.CeremonyDataClockConsensusEngine, engineConfig *config.EngineConfig, keyManager keys.KeyManager, pubSub p2p.PubSub, @@ -63,6 +64,27 @@ func NewCeremonyExecutionEngine( panic(errors.New("logger is nil")) } + seed, err := hex.DecodeString(engineConfig.GenesisSeed) + if err != nil { + panic(err) + } + + intrinsicFilter := append( + p2p.GetBloomFilter(application.CEREMONY_ADDRESS, 256, 3), + p2p.GetBloomFilterIndices(application.CEREMONY_ADDRESS, 65536, 24)..., + ) + + clock := ceremony.NewCeremonyDataClockConsensusEngine( + engineConfig, + logger, + keyManager, + clockStore, + keyStore, + pubSub, + intrinsicFilter, + seed, + ) + e := &CeremonyExecutionEngine{ logger: logger, clock: clock, @@ -76,8 +98,18 @@ func NewCeremonyExecutionEngine( alreadyPublishedShare: false, seenMessageMx: sync.Mutex{}, seenMessageMap: map[string]bool{}, + intrinsicFilter: intrinsicFilter, } + peerId := e.pubSub.GetPeerID() + addr, err := poseidon.HashBytes(peerId) + if err != nil { + panic(err) + } + + addrBytes := addr.Bytes() + addrBytes = append(make([]byte, 32-len(addrBytes)), addrBytes...) 
+ e.peerIdHash = addrBytes provingKey, _, publicKeyBytes, provingKeyAddress := e.clock.GetProvingKey( engineConfig, ) @@ -117,15 +149,7 @@ func (e *CeremonyExecutionEngine) Start() <-chan error { )) go func() { - seed, err := hex.DecodeString(e.engineConfig.GenesisSeed) - if err != nil { - panic(err) - } - - err = <-e.clock.Start( - application.CEREMONY_ADDRESS, - seed, - ) + err := <-e.clock.Start() if err != nil { panic(err) } @@ -175,7 +199,7 @@ func (e *CeremonyExecutionEngine) ProcessMessage( return nil, errors.Wrap(err, "process message") } - if frame.FrameNumber < e.clock.GetFrame() { + if frame.FrameNumber < e.clock.GetFrame().FrameNumber { return nil, nil } @@ -270,7 +294,7 @@ func (e *CeremonyExecutionEngine) RunWorker() { frameChan := e.clock.GetFrameChannel() for { frameFromBuffer := <-frameChan - frame := e.clock.GetActiveFrame() + frame := e.clock.GetFrame() e.activeClockFrame = frame e.logger.Info( "evaluating next frame", @@ -289,9 +313,10 @@ func (e *CeremonyExecutionEngine) RunWorker() { } _, _, reward := app.RewardTrie.Get(e.provingKeyAddress) + _, _, retro := app.RewardTrie.Get(e.peerIdHash) e.logger.Info( "current application state", - zap.Uint64("my_balance", reward), + zap.Uint64("my_balance", reward+retro), zap.String("lobby_state", app.LobbyState.String()), ) @@ -313,7 +338,10 @@ func (e *CeremonyExecutionEngine) RunWorker() { e.logger.Info( "lobby open for joins", zap.Int("joined_participants", len(app.LobbyJoins)), - zap.Int("preferred_participants", len(app.NextRoundPreferredParticipants)), + zap.Int( + "preferred_participants", + len(app.NextRoundPreferredParticipants), + ), zap.Bool("in_lobby", alreadyJoined), zap.Uint64("state_count", app.StateCount), ) @@ -337,7 +365,10 @@ func (e *CeremonyExecutionEngine) RunWorker() { case application.CEREMONY_APPLICATION_STATE_IN_PROGRESS: inRound := false for _, p := range app.ActiveParticipants { - if bytes.Equal(p.KeyValue, e.proverPublicKey) { + if bytes.Equal( + 
p.PublicKeySignatureEd448.PublicKey.KeyValue, + e.proverPublicKey, + ) { inRound = true break } @@ -353,7 +384,10 @@ func (e *CeremonyExecutionEngine) RunWorker() { e.logger.Info( "round in progress", zap.Any("participants", app.ActiveParticipants), - zap.Any("current_seen_attestations", len(app.LatestSeenProverAttestations)), + zap.Any( + "current_seen_attestations", + len(app.LatestSeenProverAttestations), + ), zap.Any( "current_dropped_attestations", len(app.DroppedParticipantAttestations), @@ -371,7 +405,10 @@ func (e *CeremonyExecutionEngine) RunWorker() { if len(e.peerChannels) == 0 && app.RoundCount == 1 && len(app.ActiveParticipants) > 1 { for i, p := range app.ActiveParticipants { - if bytes.Equal(p.KeyValue, e.proverPublicKey) { + if bytes.Equal( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + e.proverPublicKey, + ) { shouldConnect = true position = i break @@ -418,7 +455,10 @@ func (e *CeremonyExecutionEngine) RunWorker() { } } } else if len(app.ActiveParticipants) == 1 && - bytes.Equal(app.ActiveParticipants[0].KeyValue, e.proverPublicKey) { + bytes.Equal( + app.ActiveParticipants[0].PublicKeySignatureEd448.PublicKey.KeyValue, + e.proverPublicKey, + ) { if err = e.commitRound(e.activeSecrets); err != nil { e.logger.Error("error while participating in round", zap.Error(err)) } @@ -427,7 +467,10 @@ func (e *CeremonyExecutionEngine) RunWorker() { e.logger.Info( "round contribution finalizing", zap.Any("participants", len(app.ActiveParticipants)), - zap.Any("current_seen_attestations", len(app.LatestSeenProverAttestations)), + zap.Any( + "current_seen_attestations", + len(app.LatestSeenProverAttestations), + ), zap.Any( "current_dropped_attestations", len(app.DroppedParticipantAttestations), @@ -450,7 +493,10 @@ func (e *CeremonyExecutionEngine) RunWorker() { shouldPublish := false for _, p := range app.ActiveParticipants { - if bytes.Equal(p.KeyValue, e.proverPublicKey) { + if bytes.Equal( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + 
e.proverPublicKey, + ) { shouldPublish = true break } @@ -587,7 +633,7 @@ func (e *CeremonyExecutionEngine) announceJoin( return errors.Wrap( e.publishMessage( - application.CEREMONY_ADDRESS, + e.intrinsicFilter, join, ), "announce join", @@ -607,34 +653,20 @@ func (e *CeremonyExecutionEngine) connectToActivePeers( return errors.Wrap(err, "connect to active peers") } - for i, p := range app.ActiveParticipants { - if !bytes.Equal(p.KeyValue, e.proverPublicKey) { - ic, err := e.keyStore.GetLatestKeyBundle(p.KeyValue) - if err != nil { - return errors.Wrap(err, "connect to active peers") - } - - var kba *protobufs.KeyBundleAnnouncement - switch ic.TypeUrl { - case protobufs.KeyBundleAnnouncementType: - kba = &protobufs.KeyBundleAnnouncement{} - if err := proto.Unmarshal( - ic.Data, - kba, - ); err != nil { - return errors.Wrap(err, "connect to active peers") - } - } - + for i, p := range app.LobbyJoins { + if !bytes.Equal( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + e.proverPublicKey, + ) { receiverIdk, err := curves.ED448().Point.FromAffineCompressed( - kba.IdentityKey.GetPublicKeySignatureEd448().PublicKey.KeyValue, + p.IdentityKey.KeyValue, ) if err != nil { return errors.Wrap(err, "connect to active peers") } receiverSpk, err := curves.ED448().Point.FromAffineCompressed( - kba.SignedPreKey.GetPublicKeySignatureEd448().PublicKey.KeyValue, + p.SignedPreKey.KeyValue, ) if err != nil { return errors.Wrap(err, "connect to active peers") @@ -642,19 +674,24 @@ func (e *CeremonyExecutionEngine) connectToActivePeers( client, err := e.clock.GetPublicChannelForProvingKey( i > position, - p.KeyValue, + p.PublicKeySignatureEd448.PublicKey.KeyValue, ) if err != nil { e.logger.Error( "peer does not support direct public channels", - zap.Binary("proving_key", p.KeyValue), + zap.Binary( + "proving_key", + p.PublicKeySignatureEd448.PublicKey.KeyValue, + ), zap.Error(err), ) } - e.peerChannels[string(p.KeyValue)], err = p2p.NewPublicP2PChannel( + e.peerChannels[string( + 
p.PublicKeySignatureEd448.PublicKey.KeyValue, + )], err = p2p.NewPublicP2PChannel( client, e.proverPublicKey, - p.KeyValue, + p.PublicKeySignatureEd448.PublicKey.KeyValue, i > position, idk, spk, @@ -690,8 +727,13 @@ func (e *CeremonyExecutionEngine) participateRound( idks := []curves.Point{} initiator := false for _, p := range app.ActiveParticipants { - if !bytes.Equal(p.KeyValue, e.proverPublicKey) { - ic, err := e.keyStore.GetLatestKeyBundle(p.KeyValue) + if !bytes.Equal( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + e.proverPublicKey, + ) { + ic, err := e.keyStore.GetLatestKeyBundle( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + ) if err != nil { return errors.Wrap(err, "participate round") } @@ -722,22 +764,29 @@ func (e *CeremonyExecutionEngine) participateRound( return errors.Wrap(err, "participate round") } - if _, ok := e.peerChannels[string(p.KeyValue)]; !ok { + if _, ok := e.peerChannels[string( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + )]; !ok { client, err := e.clock.GetPublicChannelForProvingKey( initiator, - p.KeyValue, + p.PublicKeySignatureEd448.PublicKey.KeyValue, ) if err != nil { e.logger.Error( "peer does not support direct public channels", - zap.Binary("proving_key", p.KeyValue), + zap.Binary( + "proving_key", + p.PublicKeySignatureEd448.PublicKey.KeyValue, + ), zap.Error(err), ) } - e.peerChannels[string(p.KeyValue)], err = p2p.NewPublicP2PChannel( + e.peerChannels[string( + p.PublicKeySignatureEd448.PublicKey.KeyValue, + )], err = p2p.NewPublicP2PChannel( client, e.proverPublicKey, - p.KeyValue, + p.PublicKeySignatureEd448.PublicKey.KeyValue, initiator, idk, spk, @@ -761,7 +810,10 @@ func (e *CeremonyExecutionEngine) participateRound( pubKeys := [][]byte{} for _, p := range app.ActiveParticipants { - pubKeys = append(pubKeys, p.KeyValue) + pubKeys = append( + pubKeys, + p.PublicKeySignatureEd448.PublicKey.KeyValue, + ) } newSecrets, err := application.ProcessRound( @@ -834,7 +886,7 @@ func (e *CeremonyExecutionEngine) 
commitRound(secrets []curves.Scalar) error { } if err := e.publishMessage( - application.CEREMONY_ADDRESS, + e.intrinsicFilter, advance, ); err != nil { return errors.Wrap(err, "commit round") @@ -849,7 +901,7 @@ func (e *CeremonyExecutionEngine) commitRound(secrets []curves.Scalar) error { func (e *CeremonyExecutionEngine) publishDroppedParticipant( participant []byte, ) { - frameNumber := e.clock.GetFrame() + frameNumber := e.clock.GetFrame().FrameNumber b := binary.BigEndian.AppendUint64([]byte("dropped"), frameNumber) b = append(b, participant...) @@ -876,7 +928,7 @@ func (e *CeremonyExecutionEngine) publishDroppedParticipant( } err = e.publishMessage( - application.CEREMONY_ADDRESS, + e.intrinsicFilter, dropped, ) if err != nil { @@ -893,7 +945,7 @@ func (e *CeremonyExecutionEngine) publishDroppedParticipant( func (e *CeremonyExecutionEngine) publishLastSeenParticipant( participant []byte, ) { - frameNumber := e.clock.GetFrame() + frameNumber := e.clock.GetFrame().FrameNumber b := binary.BigEndian.AppendUint64([]byte("lastseen"), frameNumber) b = append(b, participant...) 
@@ -919,7 +971,7 @@ func (e *CeremonyExecutionEngine) publishLastSeenParticipant( }, } err = e.publishMessage( - application.CEREMONY_ADDRESS, + e.intrinsicFilter, seen, ) if err != nil { @@ -1019,7 +1071,7 @@ func (e *CeremonyExecutionEngine) publishTranscriptShare( err = errors.Wrap( e.publishMessage( - application.CEREMONY_ADDRESS, + e.intrinsicFilter, transcriptShare, ), "publish transcript share", @@ -1035,7 +1087,7 @@ func (e *CeremonyExecutionEngine) publishTranscriptShare( func (e *CeremonyExecutionEngine) VerifyExecution( frame *protobufs.ClockFrame, ) error { - if e.clock.GetFrame() != frame.FrameNumber-1 { + if e.clock.GetFrame().FrameNumber != frame.FrameNumber-1 { return nil } @@ -1102,7 +1154,7 @@ func (e *CeremonyExecutionEngine) publishTranscript( e.alreadyPublishedTranscript = true err := errors.Wrap( e.publishMessage( - application.CEREMONY_ADDRESS, + e.intrinsicFilter, app.UpdatedTranscript, ), "publish transcript share", diff --git a/node/go.mod b/node/go.mod index ffaeba4..3cd7cfc 100644 --- a/node/go.mod +++ b/node/go.mod @@ -11,9 +11,11 @@ replace github.com/libp2p/go-libp2p-gostream => ../go-libp2p-gostream replace source.quilibrium.com/quilibrium/monorepo/go-libp2p-blossomsub => ../go-libp2p-blossomsub +replace github.com/cockroachdb/pebble => ../pebble + require ( filippo.io/edwards25519 v1.0.0-rc.1 - github.com/cockroachdb/pebble v0.0.0-20231025190044-422dce910055 + github.com/cockroachdb/pebble v0.0.0-20231210175920-b4d301aeb46a github.com/libp2p/go-libp2p v0.31.0 github.com/libp2p/go-libp2p-gostream v0.6.0 github.com/libp2p/go-libp2p-kad-dht v0.23.0 @@ -57,11 +59,9 @@ require ( github.com/quic-go/qtls-go1-19 v0.3.3 // indirect github.com/quic-go/qtls-go1-20 v0.2.3 // indirect github.com/rivo/uniseg v0.2.0 // indirect - golang.org/x/term v0.14.0 // indirect - google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect + golang.org/x/term v0.14.0 google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d // 
indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect - google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.3.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) @@ -126,7 +126,7 @@ require ( github.com/mikioh/tcpinfo v0.0.0-20190314235526-30a79bb1804b // indirect github.com/mikioh/tcpopt v0.0.0-20190314235656-172688c1accc // indirect github.com/minio/sha256-simd v1.0.1 // indirect - github.com/mr-tron/base58 v1.2.0 // indirect + github.com/mr-tron/base58 v1.2.0 github.com/multiformats/go-base32 v0.1.0 // indirect github.com/multiformats/go-base36 v0.2.0 // indirect github.com/multiformats/go-multiaddr-dns v0.3.1 // indirect diff --git a/node/go.sum b/node/go.sum index f1e8d1a..72df8bd 100644 --- a/node/go.sum +++ b/node/go.sum @@ -9,22 +9,13 @@ dmitri.shuralyov.com/state v0.0.0-20180228185332-28bcc343414c/go.mod h1:0PRwlb0D filippo.io/edwards25519 v1.0.0-rc.1 h1:m0VOOB23frXZvAOK44usCgLWvtsxIoMCTBGJZlpmGfU= filippo.io/edwards25519 v1.0.0-rc.1/go.mod h1:N1IkdkCkiLB6tki+MYJoSx2JTY9NUlxZE7eHn5EwJns= git.apache.org/thrift.git v0.0.0-20180902110319-2566ecd5d999/go.mod h1:fPE2ZNJGynbRyZ4dJvy6G277gSllfV2HJqblrnkyeyg= -github.com/AndreasBriese/bbloom v0.0.0-20190306092124-e2d15f34fcf9/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/CloudyKit/fastprinter v0.0.0-20170127035650-74b38d55f37a/go.mod h1:EFZQ978U7x8IRnstaskI3IysnWY5Ao3QgZUKOXlsAdw= -github.com/CloudyKit/jet v2.1.3-0.20180809161101-62edd43e4f88+incompatible/go.mod h1:HPYO+50pSWkPoj9Q/eq0aRGByCL6ScRlUmiEX5Zgm+w= github.com/DataDog/zstd v1.4.5 h1:EndNeuB0l9syBZhut0wns3gV1hL8zX8LIu6ZiVHWLIQ= github.com/DataDog/zstd v1.4.5/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo= -github.com/Joker/hpp v1.0.0/go.mod h1:8x5n+M1Hp5hC0g8okX3sR3vFQwynaX/UgSOM9MeBKzY= -github.com/Joker/jade v1.0.1-0.20190614124447-d475f43051e7/go.mod h1:6E6s8o2AE4KhCrqr6GRJjdC/gNfTdxkIXvuGZZda2VM= 
-github.com/Shopify/goreferrer v0.0.0-20181106222321-ec9c9a553398/go.mod h1:a1uqRtAwp2Xwc6WNPJEufxJ7fx3npB4UV/JOLmbu5I0= github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= -github.com/ajg/form v1.5.1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= -github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= -github.com/aymerick/raymond v2.0.3-0.20180322193309-b565731e1464+incompatible/go.mod h1:osfaiScAUVup+UC9Nfq76eWqDhXlp+4UYaA8uhTBO6g= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/benbjohnson/clock v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/benbjohnson/clock v1.3.5 h1:VvXlSJBzZpA/zum6Sj74hxwYI2DIxRWuNIoXAzHZz5o= @@ -61,30 +52,16 @@ github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cloudflare/circl v1.3.3 h1:fE/Qz0QdIGqeWfnwq0RE0R7MI51s0M2E4Ga9kq5AEMs= github.com/cloudflare/circl v1.3.3/go.mod h1:5XYMA4rFBvNIrhs50XuiBJ15vF2pZn4nnUKZrLbUZFA= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= -github.com/cockroachdb/datadriven v1.0.0/go.mod h1:5Ib8Meh+jk1RlHIXej6Pzevx/NLlNvQB9pmSBZErGA4= github.com/cockroachdb/datadriven v1.0.3-0.20230413201302-be42291fc80f h1:otljaYPt5hWxV3MUfO5dFPFiOXg9CyG5/kCfayTqsJ4= -github.com/cockroachdb/errors v1.6.1/go.mod h1:tm6FTP5G81vwJ5lC0SizQo374JNCOPrHyXGitRJoDqM= -github.com/cockroachdb/errors v1.8.1 h1:A5+txlVZfOqFBDa4mGz2bUWSp0aHElvHX2bKkdbQu+Y= -github.com/cockroachdb/errors v1.8.1/go.mod h1:qGwQn6JmZ+oMjuLwjWzUNqblqk0xl4CVV3SQbGwK7Ac= 
github.com/cockroachdb/errors v1.11.1 h1:xSEW75zKaKCWzR3OfxXUxgrk/NtT4G1MiOv5lWZazG8= github.com/cockroachdb/errors v1.11.1/go.mod h1:8MUxA3Gi6b25tYlFEBGLf+D8aISL+M4MIpiWMSNRfxw= -github.com/cockroachdb/logtags v0.0.0-20190617123548-eb05cc24525f h1:o/kfcElHqOiXqcou5a3rIlMc7oJbMQkeLk0VQJ7zgqY= -github.com/cockroachdb/logtags v0.0.0-20190617123548-eb05cc24525f/go.mod h1:i/u985jwjWRlyHXQbwatDASoW0RMlZ/3i9yJHE2xLkI= github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZeQy818SGhaone5OnYfxFR/+AzdY3sf5aE= github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs= -github.com/cockroachdb/pebble v0.0.0-20230527012508-ac69476c46ff h1:/F1VgP7wxZCRj8PzresPo2NbAdgPwmU7pi+CgZ8sBZw= -github.com/cockroachdb/pebble v0.0.0-20230527012508-ac69476c46ff/go.mod h1:TkdVsGYRqtULUppt2RbC+YaKtTHnHoWa2apfFrSKABw= -github.com/cockroachdb/pebble v0.0.0-20231025190044-422dce910055 h1:EigfnVX/iY/WTi3F+f4ezhAxJO+BePglQkEAKycNhqo= -github.com/cockroachdb/pebble v0.0.0-20231025190044-422dce910055/go.mod h1:sEHm5NOXxyiAoKWhoFxT8xMgd/f3RA6qUqQ1BXKrh2E= -github.com/cockroachdb/redact v1.0.8 h1:8QG/764wK+vmEYoOlfobpe12EQcS81ukx/a4hdVMxNw= -github.com/cockroachdb/redact v1.0.8/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= +github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895 h1:XANOgPYtvELQ/h4IrmPAohXqe2pWA8Bwhejr3VQoZsA= github.com/cockroachdb/redact v1.1.5 h1:u1PMllDkdFfPWaNGMyLD1+so+aq3uUItthCFqzwPJ30= github.com/cockroachdb/redact v1.1.5/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= -github.com/cockroachdb/sentry-go v0.6.1-cockroachdb.2 h1:IKgmqgMQlVJIZj19CdocBeSfSaiCbEBZGKODaixqtHM= -github.com/cockroachdb/sentry-go v0.6.1-cockroachdb.2/go.mod h1:8BT+cPK6xvFOcRlk0R8eg+OTkcqI6baNH4xAkpiYVvQ= github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06 h1:zuQyyAKVxetITBuuhv3BI9cMrmStnpT18zmgmTxunpo= github.com/cockroachdb/tokenbucket 
v0.0.0-20230807174530-cc333fc44b06/go.mod h1:7nc4anLGjupUW/PeY5qiNYsdNXj7zopG+eqsS7To5IQ= -github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM= github.com/consensys/bavard v0.1.8-0.20210915155054-088da2f7f54a/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= github.com/consensys/gnark-crypto v0.5.3 h1:4xLFGZR3NWEH2zy+YzvzHicpToQR8FXFbfLNvpGB+rE= github.com/consensys/gnark-crypto v0.5.3/go.mod h1:hOdPlWQV1gDLp7faZVeg8Y0iEPFaOUnCc4XeCCk96p0= @@ -93,14 +70,10 @@ github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaD github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 h1:q2hJAaP1k2wIvVRd/hEHD7lacgqrCPS+k8g1MndzfWY= github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81/go.mod h1:YynlIjWYF8myEu6sdkwKIvGQq+cOckRm6So2avqoYAk= -github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= -github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= -github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd v0.0.0-20181012123002-c6f51f82210d/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/creack/pty v1.1.9/go.mod 
h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= @@ -114,14 +87,10 @@ github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5il github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 h1:8UrgZ3GkP4i/CLijOJx79Yu+etlyjdBU4sfcs2WYQMs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0/go.mod h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0= github.com/decred/dcrd/lru v1.0.0/go.mod h1:mxKOwFd7lFjN2GZYsiz/ecgqR6kkYAl+0pz0tEMk218= -github.com/dgraph-io/badger v1.6.0/go.mod h1:zwt7syl517jmP8s94KqSxTlM6IMsdhYy6psNgSztDR4= -github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= -github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= -github.com/eknkc/amber v0.0.0-20171010120322-cdade1c07385/go.mod h1:0vRUJqYpeSZifjYj7uP3BG/gKcuzL9xWVV/Y+cK33KM= github.com/elastic/gosigar v0.12.0/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0WKyPWoMs= github.com/elastic/gosigar v0.14.2 h1:Dg80n8cr90OZ7x+bAax/QjoW/XqTI11RmA79ZwIm9/4= github.com/elastic/gosigar v0.14.2/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0WKyPWoMs= @@ -129,10 +98,6 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/etcd-io/bbolt v1.3.3/go.mod 
h1:ZF2nL25h33cCyBtcyWeZ2/I3HQOfTP+0PIEvHjkjCrw= -github.com/fasthttp-contrib/websocket v0.0.0-20160511215533-1f3b11f56072/go.mod h1:duJ4Jxv5lDcvg4QuQr0oowTf7dz4/CR8NtyCooz9HL8= -github.com/fatih/structs v1.1.0/go.mod h1:9NiDSp5zOcgEDl+j00MP/WkGVPOlPRLejGD8Ga6PJ7M= -github.com/flosch/pongo2 v0.0.0-20190707114632-bbf5a6c351f4/go.mod h1:T9YF2M40nIgbVgp3rreNmTged+9HrbNTIQf1PsaIiTA= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= github.com/flynn/noise v1.0.0 h1:DlTHqmzmvcEiKj+4RYo/imoswx/4r6iBlCMfVtrMXpQ= github.com/flynn/noise v1.0.0/go.mod h1:xbMo+0i6+IGbYdJhF31t2eR1BIU0CYc12+BNAKwUTag= @@ -140,43 +105,30 @@ github.com/francoispqt/gojay v1.2.13 h1:d2m3sFjloqoIUQU3TsHBgj6qg/BVGlTBeHDUmyJn github.com/francoispqt/gojay v1.2.13/go.mod h1:ehT5mTG4ua4581f1++1WLG0vPdaA9HaiDsoyrBGkyDY= github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= -github.com/gavv/httpexpect v2.0.0+incompatible/go.mod h1:x+9tiU1YnrOvnB725RkpoLv1M62hOWzwo5OXotisrKc= github.com/getsentry/sentry-go v0.18.0 h1:MtBW5H9QgdcJabtZcuJG80BMOwaBpkRDZkxRkNC1sN0= github.com/getsentry/sentry-go v0.18.0/go.mod h1:Kgon4Mby+FJ7ZWHFUAZgVaIa8sxHtnRJRLTXZr51aKQ= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= -github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= github.com/gliderlabs/ssh v0.1.1/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= -github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98= -github.com/go-errors/errors v1.0.1 h1:LUHzmkK3GUKUrL/1gfBUxAHzcev3apQlezX/+O7ma6w= 
github.com/go-errors/errors v1.0.1/go.mod h1:f4zRHt4oKfwPJE5k8C9vpYG+aDHdBFUsgrm6/TyX73Q= +github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab/go.mod h1:/P9AEU963A2AYjv4d1V5eVL1CQbEJq6aCNHDDjibzu8= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/go-yaml/yaml v2.1.0+incompatible/go.mod h1:w2MrLa16VYP0jy6N7M5kHaCkaLENm+P+Tv+MfurjSw0= -github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= -github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= -github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/gogo/googleapis v0.0.0-20180223154316-0cd9801be74a/go.mod h1:gf4bu3Q80BeJ6H1S1vYPm8/ELATdvryBaNFGgqEef3s= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= -github.com/gogo/protobuf v1.2.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf 
v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/gogo/status v1.1.0/go.mod h1:BFv9nrluPLmrS0EmGVvLaPNmRosr9KapBYd5/hpY1WM= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/glog v1.1.0 h1:/d3pCKDPWNnvIWe0vVUpNP32qc8U3PDVxySP/y360qE= -github.com/golang/glog v1.1.0/go.mod h1:pfYeQZ3JWZoXTV5sFc986z3HTpwQs9At6P4ImfuP3NQ= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:tluoj9z5200jBnyusfRPU2LqT6J+DAorxEvtC7LHB+E= @@ -187,21 +139,18 @@ github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+Licev github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.1/go.mod 
h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= -github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/gomodule/redigo v1.7.1-0.20190724094224-574c33c3df38/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -231,7 +180,6 @@ github.com/googleapis/gax-go v2.0.0+incompatible/go.mod h1:SFVmujtThgffbyetf+mdk github.com/googleapis/gax-go/v2 v2.0.3/go.mod h1:LLvjysVCY1JZeum8Z6l8qUty8fiNwE08qbEPm1M08qg= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gopherjs/gopherjs v0.0.0-20190430165422-3e4dfb77656c h1:7lF+Vz0LqiRidnzC1Oq86fpX1q/iEv2KJdrCtttYjT4= -github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= @@ -245,20 +193,15 @@ github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY 
github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= -github.com/hashicorp/go-version v1.2.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc= github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/golang-lru/v2 v2.0.2 h1:Dwmkdr5Nc/oBiXgJS3CDHNhJtIHkuZ3DZF5twqnfBdU= github.com/hashicorp/golang-lru/v2 v2.0.2/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= -github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/huin/goupnp v1.2.0 h1:uOKW26NG1hsSSbXIZ1IR7XP9Gjd1U8pnLaCMgntmkmY= github.com/huin/goupnp v1.2.0/go.mod h1:gnGPsThkYa7bFi/KWmEysQRf48l2dvR5bxr2OFckNX8= -github.com/hydrogen18/memlistener v0.0.0-20141126152155-54553eb933fb/go.mod h1:qEIFzExnS6016fRpRfxrExeVn2gbClQA99gQhnIcdhE= github.com/iden3/go-iden3-crypto v0.0.15 h1:4MJYlrot1l31Fzlo2sF56u7EVFeHHJkxGXXZCtESgK4= github.com/iden3/go-iden3-crypto v0.0.15/go.mod h1:dLpM4vEPJ3nDHzhWFXDjzkn1qHoBeOT/3UEhXsEsP3E= -github.com/imkira/go-interpol v1.1.0/go.mod h1:z0h2/2T3XF8kyEPpRgJ3kmNv+C43p+I/CoI+jC3w2iA= -github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/ipfs/boxo v0.8.0 h1:UdjAJmHzQHo/j3g3b1bAcAXCj/GM6iTwvSlBDvPBNBs= github.com/ipfs/boxo v0.8.0/go.mod h1:RIsi4CnTyQ7AUsNn5gXljJYZlQrHBMnJp94p73liFiA= github.com/ipfs/go-cid v0.4.1 h1:A/T3qGvxi4kpKWWcPC/PgbvDA2bjVLO7n4UeVwnbs/s= @@ -276,10 +219,6 @@ github.com/ipfs/go-log/v2 v2.5.1 h1:1XdUzF7048prq4aBjDQQ4SL5RxftpRGdXhNRwKSAlcY= github.com/ipfs/go-log/v2 v2.5.1/go.mod 
h1:prSpmC1Gpllc9UYWxDiZDreBYw7zp4Iqp1kOLU9U5UI= github.com/ipld/go-ipld-prime v0.20.0 h1:Ud3VwE9ClxpO2LkCYP7vWPc0Fo+dYdYzgxUJZ3uRG4g= github.com/ipld/go-ipld-prime v0.20.0/go.mod h1:PzqZ/ZR981eKbgdr3y2DJYeD/8bgMawdGVlJDE8kK+M= -github.com/iris-contrib/blackfriday v2.0.0+incompatible/go.mod h1:UzZ2bDEoaSGPbkg6SAB4att1aAwTmVIx/5gCVqeyUdI= -github.com/iris-contrib/go.uuid v2.0.0+incompatible/go.mod h1:iz2lgM/1UnEf1kP0L/+fafWORmlnuysV2EMP8MW+qe0= -github.com/iris-contrib/i18n v0.0.0-20171121225848-987a633949d0/go.mod h1:pMCz62A0xJL6I+umB2YTlFRwWXaDFA0jy+5HzGiJjqI= -github.com/iris-contrib/schema v0.0.1/go.mod h1:urYA3uvUNG1TIIjOSCzHr9/LmbQo8LrOcOqfqxa4hXw= github.com/jackpal/go-nat-pmp v1.0.2 h1:KzKSgb7qkJvOUTqYl9/Hg/me3pWgBmERKrTGD7BdWus= github.com/jackpal/go-nat-pmp v1.0.2/go.mod h1:QPH045xvCAeXUZOxsnwmrtiCoxIr9eob+4orBN1SBKc= github.com/jbenet/go-cienv v0.1.0/go.mod h1:TqNnHUmJgXau0nCzC7kXWeotg3J9W34CUv5Djy1+FlA= @@ -295,23 +234,12 @@ github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCV github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/juju/errors v0.0.0-20181118221551-089d3ea4e4d5/go.mod h1:W54LbzXuIE0boCoNJfwqpmkKJ1O4TCTZMetAt6jGk7Q= -github.com/juju/loggo v0.0.0-20180524022052-584905176618/go.mod h1:vgyd7OREkbtVEN/8IXZe5Ooef3LQePvuBm9UWj6ZL8U= -github.com/juju/testing v0.0.0-20180920084828-472a3e8b2073/go.mod h1:63prj8cnj0tU0S9OHjGJn+b1h0ZghCndfnbQolrYTwA= -github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k= -github.com/kataras/golog v0.0.9/go.mod h1:12HJgwBIZFNGL0EJnMRhmvGA0PQGx8VFwrZtM4CqbAk= -github.com/kataras/iris/v12 v12.0.1/go.mod h1:udK4vLQKkdDqMGJJVd/msuMtN6hpYJhg/lSzuxjhO+U= 
-github.com/kataras/neffos v0.0.10/go.mod h1:ZYmJC07hQPW67eKuzlfY7SO3bC0mw83A3j6im82hfqw= -github.com/kataras/pio v0.0.0-20190103105442-ea782b38602d/go.mod h1:NV88laa9UiiDuX9AhMbDPkGYSPugBOV6yTZB1l2K9Z0= github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kkdai/bstream v0.0.0-20161212061736-f391b8402d23/go.mod h1:J+Gs4SYgM6CZQHDETBtE9HaSEkGmuNXF86RwHhHUvq4= -github.com/klauspost/compress v1.8.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= -github.com/klauspost/compress v1.9.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= -github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg= github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/koron/go-ssdp v0.0.4 h1:1IDwrghSKYM7yLf7XCzbByg2sJ/JcNOZRXS2jczTwz0= @@ -325,8 +253,6 @@ github.com/kr/pty v1.1.3/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/labstack/echo/v4 v4.1.11/go.mod h1:i541M3Fj6f76NZtHSj7TXnyM8n2gaodfvfxNnFqi74g= -github.com/labstack/gommon v0.3.0/go.mod h1:MULnywXg0yavhxWKc+lOruYdAhDwPK9wf0OL7NoOu+k= github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c= github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8= 
github.com/libp2p/go-buffer-pool v0.1.0 h1:oK4mSFcQz7cTQIfqbe4MIj9gLW+mnanjyFtc6cdF0Y8= @@ -357,14 +283,9 @@ github.com/libp2p/go-yamux/v4 v4.0.1/go.mod h1:NWjl8ZTLOGlozrXSOZ/HlfG++39iKNnM5 github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI= -github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mailru/easyjson v0.0.0-20190312143242-1de009706dbe/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/marten-seemann/tcp v0.0.0-20210406111302-dfbc87cc63fd h1:br0buuQ854V8u83wA0rVZ8ttrq5CpaPZdvrK0LP2lOk= github.com/marten-seemann/tcp v0.0.0-20210406111302-dfbc87cc63fd/go.mod h1:QuCEs1Nt24+FYQEqAAncTDPJIuGs+LxK1MCiFL25pMU= -github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= -github.com/mattn/go-isatty v0.0.7/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= @@ -373,14 +294,10 @@ github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+Ei github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= -github.com/mattn/goveralls v0.0.2/go.mod 
h1:8d1ZMHsd7fW6IRPKQh46F2WRpyib5/X4FOpevwGNQEw= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= -github.com/mediocregopher/mediocre-go-lib v0.0.0-20181029021733-cb65787f37ed/go.mod h1:dSsfyI2zABAdhcbvkXqgxOxrCsbYeHCPgrZkku60dSg= -github.com/mediocregopher/radix/v3 v3.3.0/go.mod h1:EmfVyvspXz1uZEyPBMyGK+kjWiKQGvsUt6O3Pj+LDCQ= github.com/microcosm-cc/bluemonday v1.0.1/go.mod h1:hsXNsILzKxV+sX77C5b8FSuKF00vh2OMYv+xgHpAMF4= -github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/leAFZyRl6bYmGDlGc= github.com/miekg/dns v1.1.41/go.mod h1:p6aan82bvRIyn+zDIv9xYNUpwa73JcSh9BKwknJysuI= github.com/miekg/dns v1.1.55 h1:GoQ4hpsj0nFLYe+bWiCToyrBEJXkQfOOIvFGFy0lEgo= github.com/miekg/dns v1.1.55/go.mod h1:uInx36IzPl7FYnDcMeVWxj9byh7DutNykX4G9Sj60FY= @@ -396,11 +313,8 @@ github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1/go.mod h1:pD8Rv github.com/minio/sha256-simd v0.1.1-0.20190913151208-6de447530771/go.mod h1:B5e1o+1/KgNmWrSQK08Y6Z1Vb5pwIktudl0J58iy0KM= github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM= github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8= -github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= -github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/moul/http2curl v1.0.0/go.mod h1:8UbvGypXm98wA/IqH45anm5Y2Z6ep6O31QGOAZ3H0fQ= github.com/mr-tron/base58 v1.1.2/go.mod 
h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= github.com/mr-tron/base58 v1.1.3/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o= @@ -439,22 +353,14 @@ github.com/multiformats/go-varint v0.0.1/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXS github.com/multiformats/go-varint v0.0.5/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= github.com/multiformats/go-varint v0.0.7 h1:sWSGR+f/eu5ABZA2ZpYKBILXTTs9JWpdEM/nEGOHFS8= github.com/multiformats/go-varint v0.0.7/go.mod h1:r8PUYw/fD/SjBCiKOoDlGF6QawOELpZAu9eioSos/OU= -github.com/nats-io/nats.go v1.8.1/go.mod h1:BrFz9vVn0fU3AcH9Vn4Kd7W0NpJ651tD5omQ3M8LwxM= -github.com/nats-io/nkeys v0.0.2/go.mod h1:dab7URMsZm6Z/jp9Z5UGa87Uutgc2mVpXLC4B7TDb/4= -github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86/go.mod h1:kHJEU3ofeGjhHklVoIGuVj85JJwZ6kWPaJwCIxgnFmo= github.com/neelance/sourcemap v0.0.0-20151028013722-8c68805598ab/go.mod h1:Qr6/a/Q4r9LP1IltGz7tA7iOK1WonHEYhu1HRBA7ZiM= -github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= -github.com/onsi/ginkgo v1.13.0/go.mod h1:+REjRxOmWfHCjfv9TTWB1jD1Frx4XydAD3zm1lskyM0= github.com/onsi/ginkgo/v2 v2.11.0 h1:WgqUCUt/lT6yXoQ8Wef0fsNn5cAuMK7+KT9UFRz2tcU= github.com/onsi/ginkgo/v2 v2.11.0/go.mod h1:ZhrRA5XmEE3x3rhlzamx/JJvujdZoJ2uvgI7kR0iZvM= github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= -github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= -github.com/onsi/gomega 
v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/onsi/gomega v1.27.8 h1:gegWiwZjBsf2DgiSbf5hpokZ98JVDMcWkUiigk6/KXc= github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.1.0 h1:HHUyrt9mwHUjtasSbXSMvs4cyFxh+Bll4AjJ9odEGpg= @@ -464,9 +370,7 @@ github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYr github.com/openzipkin/zipkin-go v0.1.1/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y= -github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= -github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -494,8 +398,6 @@ github.com/quic-go/qtls-go1-19 v0.3.3 h1:wznEHvJwd+2X3PqftRha0SUKmGsnb6dfArMhy9P github.com/quic-go/qtls-go1-19 v0.3.3/go.mod h1:ySOI96ew8lnoKPtSqx2BlI5wCpUVPT05RMAlajtnyOI= github.com/quic-go/qtls-go1-20 v0.2.3 h1:m575dovXn1y2ATOb1XrRFcrv0F+EQmlowTkoraNkDPI= github.com/quic-go/qtls-go1-20 v0.2.3/go.mod h1:JKtK6mjbAVcUTN/9jZpvLbGxvdWIKS8uT7EiStoU1SM= -github.com/quic-go/qtls-go1-20 v0.3.2 h1:rRgN3WfnKbyik4dBV8A6girlJVxGand/d+jVKbQq5GI= -github.com/quic-go/qtls-go1-20 v0.3.2/go.mod h1:X9Nh97ZL80Z+bX/gUXMbipO6OxdiDi58b/fMC9mAL+k= github.com/quic-go/quic-go v0.36.3 h1:f+yOqeGhMoRX7/M3wmEw/djhzKWr15FtQysox85/834= github.com/quic-go/quic-go v0.36.3/go.mod 
h1:qxQumdeKw5GmWs1OsTZZnOxzSI+RJWuhf1O8FN35L2o= github.com/quic-go/webtransport-go v0.5.3 h1:5XMlzemqB4qmOlgIus5zB45AcZ2kCgCy2EptUrfOPWU= @@ -511,10 +413,7 @@ github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjR github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= -github.com/sclevine/agouti v3.0.0+incompatible/go.mod h1:b4WX9W9L1sfQKXeJf1mUTLZKJ48R1S7H23Ji7oFO5Bw= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= -github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shurcooL/component v0.0.0-20170202220835-f88ec8f54cc4/go.mod h1:XhFIlyj5a1fBNx5aJTbKoIq0mNaPvOagO+HjB3EtxrY= github.com/shurcooL/events v0.0.0-20181021180414-410e4ca65f48/go.mod h1:5u70Mqkb5O5cxEA8nxTsgrgLehJeAw6Oc4Ab1c/P1HM= github.com/shurcooL/github_flavored_markdown v0.0.0-20181002035957-2122de532470/go.mod h1:2dOwnU2uBioM+SGy2aZoq1f/Sd1l9OkAeAUvjSyvgU0= @@ -539,22 +438,14 @@ github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeV github.com/shurcooL/users v0.0.0-20180125191416-49c67e49c537/go.mod h1:QJTqeLYEDaXHZDBsXlPCDqdhQuJkuw4NOtaxYe3xii4= github.com/shurcooL/webdavfs v0.0.0-20170829043945-18c3829fa133/go.mod h1:hKmq5kWdCj2z2KEozexVbfEZIWiTjhE0+UjmZgPqehw= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/assertions v1.2.0 h1:42S6lae5dvLc7BrLu/0ugRtcFVjoJNMC/N3yZFZkDFs= github.com/smartystreets/assertions v1.2.0/go.mod 
h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= -github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/smartystreets/goconvey v1.7.2 h1:9RBaZCeXEQ3UselpuwUQHltGVXvdwm6cv1hgR6gDIPg= github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3Pg9vgXWeJpQFMM= github.com/sourcegraph/annotate v0.0.0-20160123013949-f4cad6c6324d/go.mod h1:UdhH50NIW0fCiwBSr0co2m7BnFLdv4fQTgdqdJTHFeE= github.com/sourcegraph/syntaxhighlight v0.0.0-20170531221838-bd320f5d308e/go.mod h1:HuIsMU8RRBOtsCgI77wP899iHVBQpCmg4ErYMZB+2IA= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= -github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= -github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= -github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU= -github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= -github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= -github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -569,29 +460,14 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= -github.com/ugorji/go v1.1.4/go.mod 
h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= -github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/urfave/cli v1.22.10/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= -github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= -github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= -github.com/valyala/fasthttp v1.6.0/go.mod h1:FstJa9V+Pj9vQ7OJie2qMHdwemEDaDiSdBnvPM1Su9w= -github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8= -github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio= github.com/viant/assertly v0.4.8/go.mod h1:aGifi++jvCrUaklKEKT0BU95igDNaqkvz+49uaYMPRU= github.com/viant/toolbox v0.24.0/go.mod h1:OxMCG57V0PXuIP2HNQrtJf2CjqdmbrOx5EkMILuUhzM= github.com/warpfork/go-wish v0.0.0-20220906213052-39a1cc7a02d0 h1:GDDkbFiaK8jsSDJfjId/PEGEShv6ugrt4kYsC5UIDaQ= github.com/warpfork/go-wish v0.0.0-20220906213052-39a1cc7a02d0/go.mod h1:x6AKhvSSexNrVSrViXSHUEbICjmGXhtgABaHIySUSGw= github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1 h1:EKhdznlJHPMoKr0XTrX+IlJs1LH3lyx2nfr1dOlZ79k= github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1/go.mod h1:8UvriyWtv5Q5EOgjHaSseUEdkQfvwFv1I/In/O2M9gc= -github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= -github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= -github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= -github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= -github.com/yalp/jsonpath 
v0.0.0-20180802001716-5cc68e5049a0/go.mod h1:/LWChgwKmvncFJFHJ7Gvn9wZArjbV5/FppcK2fKk/tI= -github.com/yudai/gojsondiff v1.0.0/go.mod h1:AY32+k2cwILAkW1fbgxQ5mUmMiZFgLIV+FBNExI05xg= -github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82/go.mod h1:lgjkn3NuSvDfVJdfcVVdX+jpBxNmX4rDAzaS45IcYoM= -github.com/yudai/pp v2.0.1+incompatible/go.mod h1:PuxR/8QJ7cyCkFp/aUDS+JY727OFEZkTdatxwunjIkc= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= @@ -624,20 +500,16 @@ go4.org v0.0.0-20180809161055-417644f6feb5/go.mod h1:MkTOUMDaeVYJUOUsaDXIhWPZYa1 golang.org/x/build v0.0.0-20190111050920-041ab4dc3f9d/go.mod h1:OWs+y06UdEOHN4y+MfF/py+xQ/tYqIWW03b70/CG9Rw= golang.org/x/crypto v0.0.0-20170930174604-9419663f5a44/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20181030102418-4d3f4d9ffa16/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190313024323-a1f597ede03a/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200115085410-6d4e4cb37c7d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto 
v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200602180216-279210d13fed/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= -golang.org/x/crypto v0.12.0 h1:tFM/ta59kqch6LlvYnPa0yx5a83cL2nHflFhYKvv9Yk= -golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw= golang.org/x/crypto v0.13.0 h1:mvySKfSWJ+UKUii46M40LOvyWfN0s2U+46/jDd0e6Ck= golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -662,25 +534,18 @@ golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181029044818-c44066c5c816/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181106065722-10aee1819953/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190313220215-9f648a60d977/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190327091125-710a502c58a2/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod 
h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= -golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14= -golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= @@ -702,20 +567,11 @@ golang.org/x/sys v0.0.0-20180810173357-98c5dad5d1a0/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181029174526-d69651ed3497/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys 
v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190316082340-a2f829d7f35f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200124204421-9fbb57f87de9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200519105757-fe76b779f299/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200602225109-6fdc65e7d980/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -727,23 +583,14 @@ golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= -golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.12.0 h1:/ZfYdc3zq+q02Rv9vGqTeSItdzZTSNDmfTi0mBAuidU= -golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.14.0 h1:LGK9IlZ8T9jvdy6cTdfKUCltatMFOehAQo9SRC46UQ8= golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc= -golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -752,11 +599,9 @@ golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030000716-a0a13e073c7b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20181221001348-537d06c36207/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190327201419-c70d86f8b7cf/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190422233926-fe54fb35175b/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= @@ -784,7 +629,6 @@ google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9Ywl google.golang.org/appengine v1.2.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.3.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/genproto v0.0.0-20180518175338-11a468237815/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20180831171423-11092d34479b/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20181029155118-b69ba1387ce2/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= @@ -793,14 +637,10 
@@ google.golang.org/genproto v0.0.0-20190306203927-b5d61aea6440/go.mod h1:VzzqZJRn google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 h1:L6iMMGrtzgHsWofoFcihmDEMYeDR9KN/ThbPWGrh++g= -google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5/go.mod h1:oH/ZOT02u4kWEp7oYBGYFFkCdKS/uYR9Z7+0/xuuFp8= google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d h1:DoPTO70H+bcDXcd39vOqb2viZxgqeBeSGtZ55yZU4/Q= google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d/go.mod h1:KjSP20unUpOx5kyQUFa7k4OJg0qeJ7DEZflGDu2p6Bk= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 h1:bVf09lpb+OJbByTj913DRJioFFAjf/ZGxEz7MajTp2U= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98/go.mod h1:TUfxEVdsvPg18p6AslUXFoLdpED4oBnGwyqk3dV1XzM= google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d h1:uvYuEyMHKNt+lT4K3bN6fGswmK8qSvcreM3BwjDh+y4= google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d/go.mod h1:+Bk1OCOj40wS2hwAMA+aCW9ypzm63QTBBHp6lQ3p+9M= -google.golang.org/grpc v1.12.0/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.14.0/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.16.0/go.mod h1:0JHn/cJsOMiMfNA9+DeHDlAU7KAAB5GDlYFpa9MZMio= google.golang.org/grpc v1.17.0/go.mod h1:6QZJwpn2B+Zp71q/5VxRsJ6NXXVCE5NRUHRo+f3cWCs= @@ -808,12 +648,9 @@ google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZi google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod 
h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= google.golang.org/grpc v1.58.2 h1:SXUpjxeVF3FKrTYQI4f4KvbGD5u2xccdYdurwowix5I= google.golang.org/grpc v1.58.2/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= -google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.3.0 h1:rNBFJjBCOgVr9pWD7rs/knKL4FRTKgpZmsRfV214zcA= -google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.3.0/go.mod h1:Dk1tviKTvMCz5tvh7t+fh94dhmQVHuCt2OzJB3CTW9Y= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= @@ -829,21 +666,15 @@ google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= -gopkg.in/go-playground/assert.v1 v1.2.1/go.mod h1:9RXL0bg/zibRAgZUYszZSwO/z8Y/a8bDuhia5mkpMnE= -gopkg.in/go-playground/validator.v8 v8.18.2/go.mod 
h1:RX2a/7Ha8BgOhfk7j780h4/u/RRjR0eouCJSH80/M2Y= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/mgo.v2 v2.0.0-20180705113604-9856a29383ce/go.mod h1:yeKp02qBN3iKW1OzL3MGk2IdtZzaj7SFntXj72NppTA= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/node/keys/inmem.go b/node/keys/inmem.go new file mode 100644 index 0000000..a59a201 --- /dev/null +++ b/node/keys/inmem.go @@ -0,0 +1,197 @@ +package keys + +import ( + "crypto" + "crypto/rand" + + "github.com/cloudflare/circl/sign/ed448" + "github.com/pkg/errors" + "source.quilibrium.com/quilibrium/monorepo/nekryptology/pkg/core/curves" +) + +type InMemoryKeyManager struct { + key ByteString + store map[string]Key +} + +func NewInMemoryKeyManager() *InMemoryKeyManager { + store := make(map[string]Key) + + return &InMemoryKeyManager{ + store: store, + } +} + +// CreateSigningKey implements KeyManager +func (f *InMemoryKeyManager) CreateSigningKey( + id string, + keyType KeyType, +) (crypto.Signer, error) { + switch keyType { + case KeyTypeEd448: + pubkey, privkey, err := ed448.GenerateKey(rand.Reader) + if err != nil { + return nil, errors.Wrap(err, "could not generate key") + } + + if err = f.save( + id, + Key{ + Id: id, + Type: keyType, + PublicKey: ByteString(pubkey), + 
PrivateKey: ByteString(privkey), + }, + ); err != nil { + return nil, errors.Wrap(err, "could not save") + } + + return privkey, nil + // case KeyTypePCAS: + // _, privkey, err := addressing.GenerateKey(rand.Reader) + // if err != nil { + // return nil, errors.Wrap(err, "could not generate key") + // } + + // if err = f.save(id, privkey); err != nil { + // return nil, errors.Wrap(err, "could not save") + // } + + // return privkey, nil + } + + return nil, UnsupportedKeyTypeErr +} + +// CreateAgreementKey implements KeyManager +func (f *InMemoryKeyManager) CreateAgreementKey( + id string, + keyType KeyType, +) (curves.Scalar, error) { + switch keyType { + case KeyTypeX448: + privkey := curves.ED448().Scalar.Random(rand.Reader) + pubkey := curves.ED448().NewGeneratorPoint().Mul(privkey) + + if err := f.save( + id, + Key{ + Id: id, + Type: KeyTypeX448, + PublicKey: pubkey.ToAffineCompressed(), + PrivateKey: privkey.Bytes(), + }, + ); err != nil { + return nil, errors.Wrap(err, "could not save") + } + + return privkey, nil + } + + return nil, UnsupportedKeyTypeErr +} + +// GetAgreementKey implements KeyManager +func (f *InMemoryKeyManager) GetAgreementKey(id string) (curves.Scalar, error) { + key, err := f.read(id) + if err != nil { + return nil, err + } + + switch key.Type { + case KeyTypeX448: + privkey, err := curves.ED448().NewScalar().SetBytes(key.PrivateKey) + return privkey, err + } + + return nil, UnsupportedKeyTypeErr +} + +// GetRawKey implements KeyManager +func (f *InMemoryKeyManager) GetRawKey(id string) (*Key, error) { + key, err := f.read(id) + return &key, err +} + +// GetSigningKey implements KeyManager +func (f *InMemoryKeyManager) GetSigningKey(id string) (crypto.Signer, error) { + key, err := f.read(id) + if err != nil { + return nil, err + } + + switch key.Type { + case KeyTypeEd448: + privkey := (ed448.PrivateKey)(key.PrivateKey) + return privkey, err + // case KeyTypePCAS: + // privkey := (addressing.PCAS)(key.PrivateKey) + // return privkey, err 
+ } + + return nil, UnsupportedKeyTypeErr +} + +// PutRawKey implements KeyManager +func (f *InMemoryKeyManager) PutRawKey(key *Key) error { + return f.save(key.Id, *key) +} + +// DeleteKey implements KeyManager +func (f *InMemoryKeyManager) DeleteKey(id string) error { + delete(f.store, id) + + return nil +} + +// GetKey implements KeyManager +func (f *InMemoryKeyManager) GetKey(id string) (key *Key, err error) { + storeKey, err := f.read(id) + if err != nil { + return nil, err + } + + return &storeKey, nil +} + +// ListKeys implements KeyManager +func (f *InMemoryKeyManager) ListKeys() ([]*Key, error) { + keys := []*Key{} + + for k := range f.store { + storeKey, err := f.read(k) + if err != nil { + return nil, err + } + keys = append(keys, &storeKey) + } + + return keys, nil +} + +var _ KeyManager = (*InMemoryKeyManager)(nil) + +func (f *InMemoryKeyManager) save(id string, key Key) error { + f.store[id] = Key{ + Id: key.Id, + Type: key.Type, + PublicKey: key.PublicKey, + PrivateKey: key.PrivateKey, + } + + return nil +} + +func (f *InMemoryKeyManager) read(id string) (Key, error) { + k, ok := f.store[id] + if !ok { + return Key{}, KeyNotFoundErr + } + + return Key{ + Id: k.Id, + Type: k.Type, + PublicKey: k.PublicKey, + PrivateKey: k.PrivateKey, + }, nil +} diff --git a/node/main.go b/node/main.go index 7e7769e..81a3101 100644 --- a/node/main.go +++ b/node/main.go @@ -25,7 +25,7 @@ import ( var ( configDirectory = flag.String( "config", - "./.config/", + filepath.Join(".", ".config"), "the configuration directory", ) importPrivKey = flag.String( @@ -233,5 +233,5 @@ func printLogo() { func printVersion() { fmt.Println(" ") - fmt.Println(" Quilibrium Node - v1.1.8 – Dawn") + fmt.Println(" Quilibrium Node - v1.2.0 – Dawn") } diff --git a/node/p2p/bloom_utils.go b/node/p2p/bloom_utils.go index aa2963a..101e9c9 100644 --- a/node/p2p/bloom_utils.go +++ b/node/p2p/bloom_utils.go @@ -3,6 +3,7 @@ package p2p import ( "fmt" "math/big" + "sort" "golang.org/x/crypto/sha3" ) 
@@ -64,10 +65,10 @@ func generateBitSlices( return nil } -// getBloomFilterIndices returns a bloom filter index based on the data, however +// GetBloomFilter returns a bloom filter based on the data, however // it assumes bitLength is a multiple of 32. If the filter size is not // conformant, this will generate biased indices. -func getBloomFilterIndices(data []byte, bitLength int, k int) []byte { +func GetBloomFilter(data []byte, bitLength int, k int) []byte { size := big.NewInt(int64(bitLength)).BitLen() - 1 digest := sha3.Sum256(data) output := make([]byte, bitLength/8) @@ -75,7 +76,7 @@ func getBloomFilterIndices(data []byte, bitLength int, k int) []byte { digestBI := new(big.Int).SetBytes(digest[:]) for i := 0; i < k; i++ { position := uint(0) - for j := size*(i+1) - 1; j >= size*i; j-- { + for j := size * i; j < size*(i+1); j++ { position = position<<1 | (digestBI.Bit(j)) } if outputBI.Bit(int(position)) != 1 { @@ -96,3 +97,51 @@ func getBloomFilterIndices(data []byte, bitLength int, k int) []byte { outputBI.FillBytes(output) return output } + +// GetBloomFilterIndices returns the indices of a bloom filter, in increasing +// order, assuming bitLength is a multiple of 32 as in GetBloomFilter. +func GetBloomFilterIndices(data []byte, bitLength int, k int) []byte { + size := big.NewInt(int64(bitLength)).BitLen() - 1 + h := sha3.NewShake256() + _, err := h.Write(data) + if err != nil { + panic(err) + } + + digest := make([]byte, size*k/8) + _, err = h.Read(digest) + if err != nil { + panic(err) + } + + indices := []string{} + for i := 0; i < k; i++ { + position := make([]byte, size/8) + for j := (size / 8) * i; j < (size/8)*(i+1); j++ { + position[j%(size/8)] = digest[j] + } + found := false + for _, ext := range indices { + if ext == string(position) { + k++ + found = true + break + } + } + if !found { + p := sort.SearchStrings(indices, string(position)) + if len(indices) > p { + indices = append(indices[:p+1], indices[p:]...) 
+ indices[p] = string(position) + } else { + indices = append(indices, string(position)) + } + } + } + + output := "" + for _, idx := range indices { + output += idx + } + return []byte(output) +} diff --git a/node/p2p/bloom_utils_test.go b/node/p2p/bloom_utils_test.go new file mode 100644 index 0000000..2fa7698 --- /dev/null +++ b/node/p2p/bloom_utils_test.go @@ -0,0 +1,91 @@ +package p2p_test + +import ( + "bytes" + "testing" + + "github.com/stretchr/testify/assert" + "source.quilibrium.com/quilibrium/monorepo/node/p2p" +) + +func TestGetBloomFilter(t *testing.T) { + fourByteThreeKTest := p2p.GetBloomFilter( + []byte{0x00, 0x00, 0x00, 0x00}, + 256, + 3, + ) + assert.ElementsMatch(t, fourByteThreeKTest, []byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }) + + sixtyByteThreeKTest := p2p.GetBloomFilter( + bytes.Repeat([]byte{0x00}, 60), + 256, + 3, + ) + assert.ElementsMatch(t, sixtyByteThreeKTest, []byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }) + + fourByteSixteenKTest := p2p.GetBloomFilter( + []byte{0x00, 0x00, 0x00, 0x00}, + 65536, + 16, + ) + assert.ElementsMatch(t, fourByteSixteenKTest, []byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }) + + sixtyByteSixteenKTest := p2p.GetBloomFilter( + bytes.Repeat([]byte{0x00}, 60), + 65536, + 16, + ) + assert.ElementsMatch(t, sixtyByteSixteenKTest, []byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + }) +} + +func TestGetBloomFilterIndices(t *testing.T) { + fourByteThreeKTest := p2p.GetBloomFilterIndices( + []byte{0x00, 0x00, 0x00, 0x00}, + 256, + 3, + ) + assert.ElementsMatch(t, fourByteThreeKTest, []byte{0x1e, 0xa2, 0xb4}) + + sixtyByteThreeKTest := p2p.GetBloomFilterIndices( + bytes.Repeat([]byte{0x00}, 60), + 256, + 3, + ) + assert.ElementsMatch(t, sixtyByteThreeKTest, []byte{0x0a, 0x72, 0x80}) + + fourByteSixteenKTest := p2p.GetBloomFilterIndices( + []byte{0x00, 0x00, 0x00, 0x00}, + 65536, + 16, + ) + assert.ElementsMatch(t, fourByteSixteenKTest, []byte{ + 0x10, 0x23, 0x1e, 0x79, 0x39, 0xbe, 0x50, 0xe9, 0x64, 0x68, 0x73, 0x4f, + 0x7e, 0xd5, 0x8b, 0x4d, 0x8d, 0x15, 0x95, 0xd6, 0xb1, 0x25, 0xb3, 0x1a, + 0xb4, 0xa2, 0xbd, 0x3c, 0xea, 0x31, 0xee, 0x7e, + }) + + sixtyByteSixteenKTest := p2p.GetBloomFilterIndices( + bytes.Repeat([]byte{0x00}, 60), + 65536, + 16, + ) + assert.ElementsMatch(t, sixtyByteSixteenKTest, []byte{ + 0x10, 0x34, 0x16, 0x18, 0x27, 0xe7, 0x4b, 0xfc, 0x72, 0x0a, 0x80, 0x38, + 0x81, 0x12, 0x93, 0xec, 0xa1, 0xf8, 0xa2, 0x37, 0xa9, 0x1a, 0xc1, 0x55, + 0xc4, 0x16, 0xd1, 0x7e, 0xd5, 0xcd, 0xf0, 0x6c, + }) +} diff --git a/node/p2p/blossomsub.go b/node/p2p/blossomsub.go index f5dc4c4..58e8526 100644 --- a/node/p2p/blossomsub.go +++ b/node/p2p/blossomsub.go @@ -17,6 +17,7 @@ import ( libp2pconfig "github.com/libp2p/go-libp2p/config" "github.com/libp2p/go-libp2p/core/crypto" "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/protocol" "github.com/libp2p/go-libp2p/p2p/discovery/routing" @@ -47,8 +48,6 @@ type BlossomSub struct { var _ PubSub = (*BlossomSub)(nil) var ErrNoPeersAvailable = errors.New("no peers available") -// Crucial note, bitmask lengths should always be a power of two so as to reduce -// index bias with hash functions var BITMASK_ALL = []byte{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, @@ -180,7 +179,8 @@ func (b *BlossomSub) PublishToBitmask(bitmask []byte, data []byte) error { } func (b *BlossomSub) Publish(data []byte) error { - bitmask := getBloomFilterIndices(data, 256, 3) + bitmask := GetBloomFilter(data, 256, 3) + bitmask = append(bitmask, GetBloomFilterIndices(data, 65536, 24)...) return b.PublishToBitmask(bitmask, data) } @@ -509,7 +509,8 @@ func discoverPeers( for peer := range peerChan { peer := peer - if peer.ID == h.ID() { + if peer.ID == h.ID() || + h.Network().Connectedness(peer.ID) == network.Connected { continue } @@ -535,10 +536,7 @@ func discoverPeers( go func() { for { time.Sleep(30 * time.Second) - if len(h.Network().Peers()) == 0 { - logger.Info("reinitiating discovery") - discover() - } + discover() } }() diff --git a/node/poor_mans_cd.sh b/node/poor_mans_cd.sh new file mode 100755 index 0000000..386cbc4 --- /dev/null +++ b/node/poor_mans_cd.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +start_process() { + go run ./... & + process_pid=$! + child_process_pid=$(pgrep -P $process_pid) +} + +is_process_running() { + ps -p $process_pid > /dev/null 2>&1 + return $? +} + +kill_process() { + kill $process_pid + kill $child_process_pid +} + +start_process + +while true; do + if ! is_process_running; then + echo "Process crashed or stopped. Restarting..." 
+ start_process + fi + + git fetch + + local_head=$(git rev-parse HEAD) + remote_head=$(git rev-parse @{u}) + + if [ "$local_head" != "$remote_head" ]; then + kill_process + + git pull + + start_process + fi + + sleep 60 +done diff --git a/node/protobufs/ceremony.pb.go b/node/protobufs/ceremony.pb.go index 4e8398b..0c2af75 100644 --- a/node/protobufs/ceremony.pb.go +++ b/node/protobufs/ceremony.pb.go @@ -748,7 +748,7 @@ type CeremonyInProgressState struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - ActiveParticipants []*Ed448PublicKey `protobuf:"bytes,1,rep,name=active_participants,json=activeParticipants,proto3" json:"active_participants,omitempty"` + ActiveParticipants []*CeremonyLobbyJoin `protobuf:"bytes,1,rep,name=active_participants,json=activeParticipants,proto3" json:"active_participants,omitempty"` LatestSeenProverAttestations []*CeremonySeenProverAttestation `protobuf:"bytes,2,rep,name=latest_seen_prover_attestations,json=latestSeenProverAttestations,proto3" json:"latest_seen_prover_attestations,omitempty"` DroppedParticipantAttestations []*CeremonyDroppedProverAttestation `protobuf:"bytes,3,rep,name=dropped_participant_attestations,json=droppedParticipantAttestations,proto3" json:"dropped_participant_attestations,omitempty"` TranscriptRoundAdvanceCommits []*CeremonyAdvanceRound `protobuf:"bytes,4,rep,name=transcript_round_advance_commits,json=transcriptRoundAdvanceCommits,proto3" json:"transcript_round_advance_commits,omitempty"` @@ -787,7 +787,7 @@ func (*CeremonyInProgressState) Descriptor() ([]byte, []int) { return file_ceremony_proto_rawDescGZIP(), []int{10} } -func (x *CeremonyInProgressState) GetActiveParticipants() []*Ed448PublicKey { +func (x *CeremonyInProgressState) GetActiveParticipants() []*CeremonyLobbyJoin { if x != nil { return x.ActiveParticipants } @@ -827,7 +827,7 @@ type CeremonyFinalizingState struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - ActiveParticipants []*Ed448PublicKey 
`protobuf:"bytes,1,rep,name=active_participants,json=activeParticipants,proto3" json:"active_participants,omitempty"` + ActiveParticipants []*CeremonyLobbyJoin `protobuf:"bytes,1,rep,name=active_participants,json=activeParticipants,proto3" json:"active_participants,omitempty"` LatestSeenProverAttestations []*CeremonySeenProverAttestation `protobuf:"bytes,2,rep,name=latest_seen_prover_attestations,json=latestSeenProverAttestations,proto3" json:"latest_seen_prover_attestations,omitempty"` DroppedParticipantAttestations []*CeremonyDroppedProverAttestation `protobuf:"bytes,3,rep,name=dropped_participant_attestations,json=droppedParticipantAttestations,proto3" json:"dropped_participant_attestations,omitempty"` Commits []*CeremonyTranscriptCommit `protobuf:"bytes,4,rep,name=commits,proto3" json:"commits,omitempty"` @@ -867,7 +867,7 @@ func (*CeremonyFinalizingState) Descriptor() ([]byte, []int) { return file_ceremony_proto_rawDescGZIP(), []int{11} } -func (x *CeremonyFinalizingState) GetActiveParticipants() []*Ed448PublicKey { +func (x *CeremonyFinalizingState) GetActiveParticipants() []*CeremonyLobbyJoin { if x != nil { return x.ActiveParticipants } @@ -1567,189 +1567,190 @@ var file_ceremony_proto_rawDesc = []byte{ 0x6f, 0x64, 0x65, 0x2e, 0x6b, 0x65, 0x79, 0x73, 0x2e, 0x70, 0x62, 0x2e, 0x45, 0x64, 0x34, 0x34, 0x38, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x4b, 0x65, 0x79, 0x52, 0x15, 0x70, 0x72, 0x65, 0x66, 0x65, 0x72, 0x72, 0x65, 0x64, 0x50, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, - 0x73, 0x22, 0xde, 0x04, 0x0a, 0x17, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x49, 0x6e, - 0x50, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x58, 0x0a, + 0x73, 0x22, 0xe5, 0x04, 0x0a, 0x17, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x49, 0x6e, + 0x50, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x5f, 0x0a, 0x13, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 
0x69, 0x70, - 0x61, 0x6e, 0x74, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x71, 0x75, 0x69, - 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x6b, 0x65, 0x79, - 0x73, 0x2e, 0x70, 0x62, 0x2e, 0x45, 0x64, 0x34, 0x34, 0x38, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, - 0x4b, 0x65, 0x79, 0x52, 0x12, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x50, 0x61, 0x72, 0x74, 0x69, - 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x12, 0x81, 0x01, 0x0a, 0x1f, 0x6c, 0x61, 0x74, 0x65, - 0x73, 0x74, 0x5f, 0x73, 0x65, 0x65, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x5f, 0x61, - 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, - 0x0b, 0x32, 0x3a, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, - 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, - 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x53, 0x65, 0x65, 0x6e, 0x50, 0x72, 0x6f, 0x76, - 0x65, 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x1c, 0x6c, - 0x61, 0x74, 0x65, 0x73, 0x74, 0x53, 0x65, 0x65, 0x6e, 0x50, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x41, - 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x87, 0x01, 0x0a, 0x20, - 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, - 0x61, 0x6e, 0x74, 0x5f, 0x61, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, - 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x3d, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, + 0x61, 0x6e, 0x74, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2e, 0x2e, 0x71, 0x75, 0x69, + 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, + 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, + 0x79, 0x4c, 0x6f, 0x62, 0x62, 0x79, 0x4a, 0x6f, 0x69, 0x6e, 0x52, 0x12, 0x61, 0x63, 0x74, 0x69, + 0x76, 0x65, 0x50, 0x61, 
0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x12, 0x81, + 0x01, 0x0a, 0x1f, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x73, 0x65, 0x65, 0x6e, 0x5f, 0x70, + 0x72, 0x6f, 0x76, 0x65, 0x72, 0x5f, 0x61, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x3a, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, + 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, + 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x53, + 0x65, 0x65, 0x6e, 0x50, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x1c, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x74, 0x53, 0x65, 0x65, 0x6e, + 0x50, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x12, 0x87, 0x01, 0x0a, 0x20, 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x5f, 0x70, + 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x5f, 0x61, 0x74, 0x74, 0x65, 0x73, + 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x3d, 0x2e, + 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, + 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, + 0x6d, 0x6f, 0x6e, 0x79, 0x44, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x50, 0x72, 0x6f, 0x76, 0x65, + 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x1e, 0x64, 0x72, + 0x6f, 0x70, 0x70, 0x65, 0x64, 0x50, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, + 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x7a, 0x0a, 0x20, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x5f, 0x72, 0x6f, 0x75, 0x6e, 0x64, + 0x5f, 0x61, 0x64, 0x76, 0x61, 0x6e, 0x63, 0x65, 0x5f, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x73, + 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x31, 0x2e, 0x71, 
0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, - 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x44, 0x72, 0x6f, - 0x70, 0x70, 0x65, 0x64, 0x50, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x1e, 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x50, 0x61, - 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, - 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x7a, 0x0a, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, - 0x69, 0x70, 0x74, 0x5f, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x5f, 0x61, 0x64, 0x76, 0x61, 0x6e, 0x63, - 0x65, 0x5f, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, - 0x31, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, - 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, - 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x41, 0x64, 0x76, 0x61, 0x6e, 0x63, 0x65, 0x52, 0x6f, 0x75, - 0x6e, 0x64, 0x52, 0x1d, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x52, 0x6f, - 0x75, 0x6e, 0x64, 0x41, 0x64, 0x76, 0x61, 0x6e, 0x63, 0x65, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, - 0x73, 0x12, 0x5f, 0x0a, 0x17, 0x6e, 0x65, 0x78, 0x74, 0x5f, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x5f, - 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x18, 0x05, 0x20, 0x03, - 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, - 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x6b, 0x65, 0x79, 0x73, 0x2e, 0x70, 0x62, 0x2e, 0x45, 0x64, 0x34, - 0x34, 0x38, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x4b, 0x65, 0x79, 0x52, 0x15, 0x6e, 0x65, 0x78, - 0x74, 0x52, 0x6f, 0x75, 0x6e, 0x64, 0x50, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, - 0x74, 0x73, 0x22, 0x81, 0x05, 0x0a, 0x17, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x46, - 
0x69, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x69, 0x6e, 0x67, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x58, - 0x0a, 0x13, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, - 0x70, 0x61, 0x6e, 0x74, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x71, 0x75, - 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x6b, 0x65, - 0x79, 0x73, 0x2e, 0x70, 0x62, 0x2e, 0x45, 0x64, 0x34, 0x34, 0x38, 0x50, 0x75, 0x62, 0x6c, 0x69, - 0x63, 0x4b, 0x65, 0x79, 0x52, 0x12, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x50, 0x61, 0x72, 0x74, - 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x12, 0x81, 0x01, 0x0a, 0x1f, 0x6c, 0x61, 0x74, - 0x65, 0x73, 0x74, 0x5f, 0x73, 0x65, 0x65, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x5f, - 0x61, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, - 0x28, 0x0b, 0x32, 0x3a, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, + 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x41, 0x64, 0x76, + 0x61, 0x6e, 0x63, 0x65, 0x52, 0x6f, 0x75, 0x6e, 0x64, 0x52, 0x1d, 0x74, 0x72, 0x61, 0x6e, 0x73, + 0x63, 0x72, 0x69, 0x70, 0x74, 0x52, 0x6f, 0x75, 0x6e, 0x64, 0x41, 0x64, 0x76, 0x61, 0x6e, 0x63, + 0x65, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x73, 0x12, 0x5f, 0x0a, 0x17, 0x6e, 0x65, 0x78, 0x74, + 0x5f, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, + 0x6e, 0x74, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x71, 0x75, 0x69, 0x6c, + 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x6b, 0x65, 0x79, 0x73, + 0x2e, 0x70, 0x62, 0x2e, 0x45, 0x64, 0x34, 0x34, 0x38, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x4b, + 0x65, 0x79, 0x52, 0x15, 0x6e, 0x65, 0x78, 0x74, 0x52, 0x6f, 0x75, 0x6e, 0x64, 0x50, 0x61, 0x72, + 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x22, 0x88, 0x05, 0x0a, 0x17, 0x43, 0x65, + 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 
0x46, 0x69, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x69, 0x6e, 0x67, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x5f, 0x0a, 0x13, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, + 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x18, 0x01, 0x20, 0x03, + 0x28, 0x0b, 0x32, 0x2e, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, - 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x53, 0x65, 0x65, 0x6e, 0x50, 0x72, 0x6f, - 0x76, 0x65, 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x1c, - 0x6c, 0x61, 0x74, 0x65, 0x73, 0x74, 0x53, 0x65, 0x65, 0x6e, 0x50, 0x72, 0x6f, 0x76, 0x65, 0x72, - 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x87, 0x01, 0x0a, - 0x20, 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, - 0x70, 0x61, 0x6e, 0x74, 0x5f, 0x61, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, - 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x3d, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, - 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, - 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x44, 0x72, - 0x6f, 0x70, 0x70, 0x65, 0x64, 0x50, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, - 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x1e, 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x50, - 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x4f, 0x0a, 0x07, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, - 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x35, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, - 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, - 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 
0x79, 0x54, 0x72, - 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x52, 0x07, - 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x73, 0x12, 0x4c, 0x0a, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, - 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x34, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, - 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, - 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x54, 0x72, - 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x53, 0x68, 0x61, 0x72, 0x65, 0x52, 0x06, 0x73, - 0x68, 0x61, 0x72, 0x65, 0x73, 0x12, 0x5f, 0x0a, 0x17, 0x6e, 0x65, 0x78, 0x74, 0x5f, 0x72, 0x6f, - 0x75, 0x6e, 0x64, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, - 0x18, 0x06, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, - 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x6b, 0x65, 0x79, 0x73, 0x2e, 0x70, 0x62, - 0x2e, 0x45, 0x64, 0x34, 0x34, 0x38, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x4b, 0x65, 0x79, 0x52, - 0x15, 0x6e, 0x65, 0x78, 0x74, 0x52, 0x6f, 0x75, 0x6e, 0x64, 0x50, 0x61, 0x72, 0x74, 0x69, 0x63, - 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x22, 0xab, 0x02, 0x0a, 0x17, 0x43, 0x65, 0x72, 0x65, 0x6d, - 0x6f, 0x6e, 0x79, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x12, 0x4f, 0x0a, 0x07, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x73, 0x18, 0x01, 0x20, - 0x03, 0x28, 0x0b, 0x32, 0x35, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, - 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, - 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, - 0x72, 0x69, 0x70, 0x74, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x52, 0x07, 0x63, 0x6f, 0x6d, 0x6d, - 0x69, 0x74, 0x73, 0x12, 0x5e, 0x0a, 0x12, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x74, - 0x72, 0x61, 0x6e, 
0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, - 0x2f, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, - 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, - 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, - 0x52, 0x11, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x64, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, - 0x69, 0x70, 0x74, 0x12, 0x5f, 0x0a, 0x17, 0x6e, 0x65, 0x78, 0x74, 0x5f, 0x72, 0x6f, 0x75, 0x6e, - 0x64, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x18, 0x03, + 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x4c, 0x6f, 0x62, 0x62, 0x79, 0x4a, 0x6f, + 0x69, 0x6e, 0x52, 0x12, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x50, 0x61, 0x72, 0x74, 0x69, 0x63, + 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x12, 0x81, 0x01, 0x0a, 0x1f, 0x6c, 0x61, 0x74, 0x65, 0x73, + 0x74, 0x5f, 0x73, 0x65, 0x65, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x5f, 0x61, 0x74, + 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, + 0x32, 0x3a, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, + 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, + 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x53, 0x65, 0x65, 0x6e, 0x50, 0x72, 0x6f, 0x76, 0x65, + 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x1c, 0x6c, 0x61, + 0x74, 0x65, 0x73, 0x74, 0x53, 0x65, 0x65, 0x6e, 0x50, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x41, 0x74, + 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x87, 0x01, 0x0a, 0x20, 0x64, + 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, + 0x6e, 0x74, 0x5f, 0x61, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, + 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x3d, 0x2e, 0x71, 
0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, + 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, + 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x44, 0x72, 0x6f, 0x70, + 0x70, 0x65, 0x64, 0x50, 0x72, 0x6f, 0x76, 0x65, 0x72, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x1e, 0x64, 0x72, 0x6f, 0x70, 0x70, 0x65, 0x64, 0x50, 0x61, 0x72, + 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x41, 0x74, 0x74, 0x65, 0x73, 0x74, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x4f, 0x0a, 0x07, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x73, 0x18, + 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x35, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, + 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, + 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x54, 0x72, 0x61, 0x6e, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x52, 0x07, 0x63, 0x6f, + 0x6d, 0x6d, 0x69, 0x74, 0x73, 0x12, 0x4c, 0x0a, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x73, 0x18, + 0x05, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x34, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, + 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, + 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x54, 0x72, 0x61, 0x6e, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x53, 0x68, 0x61, 0x72, 0x65, 0x52, 0x06, 0x73, 0x68, 0x61, + 0x72, 0x65, 0x73, 0x12, 0x5f, 0x0a, 0x17, 0x6e, 0x65, 0x78, 0x74, 0x5f, 0x72, 0x6f, 0x75, 0x6e, + 0x64, 0x5f, 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x6b, 0x65, 0x79, 0x73, 0x2e, 0x70, 0x62, 0x2e, 0x45, 0x64, 0x34, 0x34, 0x38, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x4b, 0x65, 0x79, 0x52, 0x15, 0x6e, 
0x65, 0x78, 0x74, 0x52, 0x6f, 0x75, 0x6e, 0x64, 0x50, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, - 0x61, 0x6e, 0x74, 0x73, 0x22, 0x62, 0x0a, 0x18, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, - 0x50, 0x65, 0x65, 0x72, 0x4c, 0x69, 0x73, 0x74, 0x41, 0x6e, 0x6e, 0x6f, 0x75, 0x6e, 0x63, 0x65, - 0x12, 0x46, 0x0a, 0x09, 0x70, 0x65, 0x65, 0x72, 0x5f, 0x6c, 0x69, 0x73, 0x74, 0x18, 0x01, 0x20, - 0x03, 0x28, 0x0b, 0x32, 0x29, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, - 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, - 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x50, 0x65, 0x65, 0x72, 0x52, 0x08, - 0x70, 0x65, 0x65, 0x72, 0x4c, 0x69, 0x73, 0x74, 0x22, 0xd7, 0x01, 0x0a, 0x0c, 0x43, 0x65, 0x72, - 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x50, 0x65, 0x65, 0x72, 0x12, 0x17, 0x0a, 0x07, 0x70, 0x65, 0x65, - 0x72, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x06, 0x70, 0x65, 0x65, 0x72, - 0x49, 0x64, 0x12, 0x1c, 0x0a, 0x09, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x61, 0x64, 0x64, 0x72, 0x18, - 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x61, 0x64, 0x64, 0x72, - 0x12, 0x1b, 0x0a, 0x09, 0x6d, 0x61, 0x78, 0x5f, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x18, 0x03, 0x20, - 0x01, 0x28, 0x04, 0x52, 0x08, 0x6d, 0x61, 0x78, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x12, 0x1c, 0x0a, - 0x09, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, - 0x52, 0x09, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x12, 0x18, 0x0a, 0x07, 0x76, - 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x76, 0x65, - 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x1c, 0x0a, 0x09, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x74, 0x75, - 0x72, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x09, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x74, - 0x75, 0x72, 0x65, 0x12, 0x1d, 0x0a, 0x0a, 0x70, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x5f, 0x6b, 0x65, - 0x79, 0x18, 0x07, 0x20, 0x01, 0x28, 
0x0c, 0x52, 0x09, 0x70, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x4b, - 0x65, 0x79, 0x22, 0xe0, 0x02, 0x0a, 0x16, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x43, - 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x65, 0x64, 0x53, 0x79, 0x6e, 0x63, 0x12, 0x2a, 0x0a, - 0x11, 0x66, 0x72, 0x6f, 0x6d, 0x5f, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x5f, 0x6e, 0x75, 0x6d, 0x62, - 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0f, 0x66, 0x72, 0x6f, 0x6d, 0x46, 0x72, - 0x61, 0x6d, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, 0x26, 0x0a, 0x0f, 0x74, 0x6f, 0x5f, - 0x66, 0x72, 0x61, 0x6d, 0x65, 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x04, 0x52, 0x0d, 0x74, 0x6f, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, - 0x72, 0x12, 0x5a, 0x0a, 0x16, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x63, - 0x6c, 0x6f, 0x63, 0x6b, 0x5f, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, - 0x0b, 0x32, 0x24, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, - 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x6c, 0x6f, 0x63, 0x6b, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x6c, 0x6f, - 0x63, 0x6b, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x52, 0x14, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, - 0x65, 0x64, 0x43, 0x6c, 0x6f, 0x63, 0x6b, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x12, 0x47, 0x0a, - 0x06, 0x70, 0x72, 0x6f, 0x6f, 0x66, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2f, 0x2e, + 0x61, 0x6e, 0x74, 0x73, 0x22, 0xab, 0x02, 0x0a, 0x17, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, + 0x79, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x53, 0x74, 0x61, 0x74, 0x65, + 0x12, 0x4f, 0x0a, 0x07, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, + 0x0b, 0x32, 0x35, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, + 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, + 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x54, 0x72, 0x61, 0x6e, 
0x73, 0x63, 0x72, 0x69, + 0x70, 0x74, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x52, 0x07, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, + 0x73, 0x12, 0x5e, 0x0a, 0x12, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x74, 0x72, 0x61, + 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x2f, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, - 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x49, 0x6e, 0x63, 0x6c, - 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x50, 0x72, 0x6f, 0x6f, 0x66, 0x73, 0x4d, 0x61, 0x70, 0x52, 0x06, - 0x70, 0x72, 0x6f, 0x6f, 0x66, 0x73, 0x12, 0x4d, 0x0a, 0x08, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, - 0x74, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x31, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, + 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, + 0x6d, 0x6f, 0x6e, 0x79, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x52, 0x11, + 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x64, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, + 0x74, 0x12, 0x5f, 0x0a, 0x17, 0x6e, 0x65, 0x78, 0x74, 0x5f, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x5f, + 0x70, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, 0x74, 0x73, 0x18, 0x03, 0x20, 0x03, + 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, + 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x6b, 0x65, 0x79, 0x73, 0x2e, 0x70, 0x62, 0x2e, 0x45, 0x64, 0x34, + 0x34, 0x38, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x4b, 0x65, 0x79, 0x52, 0x15, 0x6e, 0x65, 0x78, + 0x74, 0x52, 0x6f, 0x75, 0x6e, 0x64, 0x50, 0x61, 0x72, 0x74, 0x69, 0x63, 0x69, 0x70, 0x61, 0x6e, + 0x74, 0x73, 0x22, 0x62, 0x0a, 0x18, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x50, 0x65, + 0x65, 0x72, 0x4c, 0x69, 0x73, 0x74, 0x41, 0x6e, 0x6e, 0x6f, 0x75, 0x6e, 0x63, 0x65, 0x12, 0x46, + 0x0a, 0x09, 0x70, 0x65, 0x65, 0x72, 0x5f, 0x6c, 0x69, 0x73, 0x74, 0x18, 0x01, 0x20, 0x03, 0x28, + 0x0b, 0x32, 
0x29, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, + 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, + 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x50, 0x65, 0x65, 0x72, 0x52, 0x08, 0x70, 0x65, + 0x65, 0x72, 0x4c, 0x69, 0x73, 0x74, 0x22, 0xd7, 0x01, 0x0a, 0x0c, 0x43, 0x65, 0x72, 0x65, 0x6d, + 0x6f, 0x6e, 0x79, 0x50, 0x65, 0x65, 0x72, 0x12, 0x17, 0x0a, 0x07, 0x70, 0x65, 0x65, 0x72, 0x5f, + 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x06, 0x70, 0x65, 0x65, 0x72, 0x49, 0x64, + 0x12, 0x1c, 0x0a, 0x09, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x61, 0x64, 0x64, 0x72, 0x18, 0x02, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x09, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x61, 0x64, 0x64, 0x72, 0x12, 0x1b, + 0x0a, 0x09, 0x6d, 0x61, 0x78, 0x5f, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x08, 0x6d, 0x61, 0x78, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x12, 0x1c, 0x0a, 0x09, 0x74, + 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x09, + 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, + 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, + 0x69, 0x6f, 0x6e, 0x12, 0x1c, 0x0a, 0x09, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x74, 0x75, 0x72, 0x65, + 0x18, 0x06, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x09, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x74, 0x75, 0x72, + 0x65, 0x12, 0x1d, 0x0a, 0x0a, 0x70, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x5f, 0x6b, 0x65, 0x79, 0x18, + 0x07, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x09, 0x70, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x4b, 0x65, 0x79, + 0x22, 0xe0, 0x02, 0x0a, 0x16, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x43, 0x6f, 0x6d, + 0x70, 0x72, 0x65, 0x73, 0x73, 0x65, 0x64, 0x53, 0x79, 0x6e, 0x63, 0x12, 0x2a, 0x0a, 0x11, 0x66, + 0x72, 0x6f, 0x6d, 0x5f, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, + 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0f, 
0x66, 0x72, 0x6f, 0x6d, 0x46, 0x72, 0x61, 0x6d, + 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, 0x26, 0x0a, 0x0f, 0x74, 0x6f, 0x5f, 0x66, 0x72, + 0x61, 0x6d, 0x65, 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, + 0x52, 0x0d, 0x74, 0x6f, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, + 0x5a, 0x0a, 0x16, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x63, 0x6c, 0x6f, + 0x63, 0x6b, 0x5f, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, + 0x24, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, + 0x65, 0x2e, 0x63, 0x6c, 0x6f, 0x63, 0x6b, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x6c, 0x6f, 0x63, 0x6b, + 0x46, 0x72, 0x61, 0x6d, 0x65, 0x52, 0x14, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x65, 0x64, + 0x43, 0x6c, 0x6f, 0x63, 0x6b, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x12, 0x47, 0x0a, 0x06, 0x70, + 0x72, 0x6f, 0x6f, 0x66, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2f, 0x2e, 0x71, 0x75, + 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, + 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x49, 0x6e, 0x63, 0x6c, 0x75, 0x73, + 0x69, 0x6f, 0x6e, 0x50, 0x72, 0x6f, 0x6f, 0x66, 0x73, 0x4d, 0x61, 0x70, 0x52, 0x06, 0x70, 0x72, + 0x6f, 0x6f, 0x66, 0x73, 0x12, 0x4d, 0x0a, 0x08, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, + 0x18, 0x05, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x31, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, + 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, + 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x49, 0x6e, 0x63, 0x6c, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x53, 0x65, + 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x4d, 0x61, 0x70, 0x52, 0x08, 0x73, 0x65, 0x67, 0x6d, 0x65, + 0x6e, 0x74, 0x73, 0x22, 0xa5, 0x01, 0x0a, 0x12, 0x49, 0x6e, 0x63, 0x6c, 0x75, 0x73, 0x69, 0x6f, + 0x6e, 0x50, 0x72, 0x6f, 0x6f, 0x66, 0x73, 0x4d, 0x61, 0x70, 0x12, 0x21, 0x0a, 0x0c, 
0x66, 0x72, + 0x61, 0x6d, 0x65, 0x5f, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, + 0x52, 0x0b, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x12, 0x14, 0x0a, + 0x05, 0x70, 0x72, 0x6f, 0x6f, 0x66, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x05, 0x70, 0x72, + 0x6f, 0x6f, 0x66, 0x12, 0x56, 0x0a, 0x0b, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x6d, 0x65, 0x6e, + 0x74, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x34, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x49, 0x6e, 0x63, 0x6c, 0x75, 0x73, 0x69, 0x6f, 0x6e, - 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x4d, 0x61, 0x70, 0x52, 0x08, 0x73, 0x65, 0x67, - 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x22, 0xa5, 0x01, 0x0a, 0x12, 0x49, 0x6e, 0x63, 0x6c, 0x75, 0x73, - 0x69, 0x6f, 0x6e, 0x50, 0x72, 0x6f, 0x6f, 0x66, 0x73, 0x4d, 0x61, 0x70, 0x12, 0x21, 0x0a, 0x0c, - 0x66, 0x72, 0x61, 0x6d, 0x65, 0x5f, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x18, 0x01, 0x20, 0x01, - 0x28, 0x0c, 0x52, 0x0b, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x12, - 0x14, 0x0a, 0x05, 0x70, 0x72, 0x6f, 0x6f, 0x66, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x05, - 0x70, 0x72, 0x6f, 0x6f, 0x66, 0x12, 0x56, 0x0a, 0x0b, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x6d, - 0x65, 0x6e, 0x74, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x34, 0x2e, 0x71, 0x75, 0x69, - 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, - 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x49, 0x6e, 0x63, 0x6c, 0x75, 0x73, 0x69, - 0x6f, 0x6e, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x4d, 0x61, 0x70, - 0x52, 0x0b, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x22, 0x3e, 0x0a, - 0x14, 0x49, 0x6e, 0x63, 0x6c, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e, - 0x74, 0x73, 0x4d, 0x61, 0x70, 
0x12, 0x12, 0x0a, 0x04, 0x68, 0x61, 0x73, 0x68, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x0c, 0x52, 0x04, 0x68, 0x61, 0x73, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x64, 0x61, 0x74, - 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x04, 0x64, 0x61, 0x74, 0x61, 0x22, 0x7b, 0x0a, - 0x17, 0x49, 0x6e, 0x63, 0x6c, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, - 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x4d, 0x61, 0x70, 0x12, 0x1e, 0x0a, 0x0a, 0x63, 0x6f, 0x6d, 0x6d, - 0x69, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0a, 0x63, 0x6f, - 0x6d, 0x6d, 0x69, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x19, 0x0a, 0x08, 0x74, 0x79, 0x70, 0x65, - 0x5f, 0x75, 0x72, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x74, 0x79, 0x70, 0x65, - 0x55, 0x72, 0x6c, 0x12, 0x25, 0x0a, 0x0e, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x5f, 0x68, - 0x61, 0x73, 0x68, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0c, 0x52, 0x0d, 0x73, 0x65, 0x67, - 0x6d, 0x65, 0x6e, 0x74, 0x48, 0x61, 0x73, 0x68, 0x65, 0x73, 0x32, 0x89, 0x02, 0x0a, 0x0f, 0x43, - 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x7e, - 0x0a, 0x17, 0x47, 0x65, 0x74, 0x43, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x65, 0x64, 0x53, - 0x79, 0x6e, 0x63, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x12, 0x2c, 0x2e, 0x71, 0x75, 0x69, 0x6c, - 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x6c, 0x6f, 0x63, - 0x6b, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x6c, 0x6f, 0x63, 0x6b, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x73, - 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x33, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, - 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, - 0x6e, 0x79, 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x43, 0x6f, - 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x65, 0x64, 0x53, 0x79, 0x6e, 0x63, 0x30, 0x01, 0x12, 0x76, - 0x0a, 0x10, 0x47, 0x65, 0x74, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, 
0x43, 0x68, 0x61, 0x6e, 0x6e, - 0x65, 0x6c, 0x12, 0x2e, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, - 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x2e, 0x70, 0x62, 0x2e, - 0x50, 0x32, 0x50, 0x43, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x45, 0x6e, 0x76, 0x65, 0x6c, 0x6f, - 0x70, 0x65, 0x1a, 0x2e, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, - 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x2e, 0x70, 0x62, 0x2e, - 0x50, 0x32, 0x50, 0x43, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x45, 0x6e, 0x76, 0x65, 0x6c, 0x6f, - 0x70, 0x65, 0x28, 0x01, 0x30, 0x01, 0x42, 0x3a, 0x5a, 0x38, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, - 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, - 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2f, 0x6d, 0x6f, 0x6e, 0x6f, 0x72, - 0x65, 0x70, 0x6f, 0x2f, 0x6e, 0x6f, 0x64, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, - 0x66, 0x73, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x4d, 0x61, 0x70, 0x52, 0x0b, + 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x22, 0x3e, 0x0a, 0x14, 0x49, + 0x6e, 0x63, 0x6c, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, + 0x4d, 0x61, 0x70, 0x12, 0x12, 0x0a, 0x04, 0x68, 0x61, 0x73, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x0c, 0x52, 0x04, 0x68, 0x61, 0x73, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x64, 0x61, 0x74, 0x61, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x04, 0x64, 0x61, 0x74, 0x61, 0x22, 0x7b, 0x0a, 0x17, 0x49, + 0x6e, 0x63, 0x6c, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x6d, 0x65, + 0x6e, 0x74, 0x73, 0x4d, 0x61, 0x70, 0x12, 0x1e, 0x0a, 0x0a, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, + 0x6d, 0x65, 0x6e, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0a, 0x63, 0x6f, 0x6d, 0x6d, + 0x69, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x12, 
0x19, 0x0a, 0x08, 0x74, 0x79, 0x70, 0x65, 0x5f, 0x75, + 0x72, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x74, 0x79, 0x70, 0x65, 0x55, 0x72, + 0x6c, 0x12, 0x25, 0x0a, 0x0e, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x5f, 0x68, 0x61, 0x73, + 0x68, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0c, 0x52, 0x0d, 0x73, 0x65, 0x67, 0x6d, 0x65, + 0x6e, 0x74, 0x48, 0x61, 0x73, 0x68, 0x65, 0x73, 0x32, 0x89, 0x02, 0x0a, 0x0f, 0x43, 0x65, 0x72, + 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x7e, 0x0a, 0x17, + 0x47, 0x65, 0x74, 0x43, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x65, 0x64, 0x53, 0x79, 0x6e, + 0x63, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x12, 0x2c, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, + 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x6c, 0x6f, 0x63, 0x6b, 0x2e, + 0x70, 0x62, 0x2e, 0x43, 0x6c, 0x6f, 0x63, 0x6b, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x52, 0x65, + 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x33, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, + 0x75, 0x6d, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x2e, 0x63, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, + 0x2e, 0x70, 0x62, 0x2e, 0x43, 0x65, 0x72, 0x65, 0x6d, 0x6f, 0x6e, 0x79, 0x43, 0x6f, 0x6d, 0x70, + 0x72, 0x65, 0x73, 0x73, 0x65, 0x64, 0x53, 0x79, 0x6e, 0x63, 0x30, 0x01, 0x12, 0x76, 0x0a, 0x10, + 0x47, 0x65, 0x74, 0x50, 0x75, 0x62, 0x6c, 0x69, 0x63, 0x43, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, + 0x12, 0x2e, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, + 0x64, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x2e, 0x70, 0x62, 0x2e, 0x50, 0x32, + 0x50, 0x43, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x45, 0x6e, 0x76, 0x65, 0x6c, 0x6f, 0x70, 0x65, + 0x1a, 0x2e, 0x2e, 0x71, 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x6e, 0x6f, + 0x64, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x2e, 0x70, 0x62, 0x2e, 0x50, 0x32, + 0x50, 0x43, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x45, 0x6e, 0x76, 0x65, 0x6c, 
0x6f, 0x70, 0x65, + 0x28, 0x01, 0x30, 0x01, 0x42, 0x3a, 0x5a, 0x38, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x2e, 0x71, + 0x75, 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x71, 0x75, + 0x69, 0x6c, 0x69, 0x62, 0x72, 0x69, 0x75, 0x6d, 0x2f, 0x6d, 0x6f, 0x6e, 0x6f, 0x72, 0x65, 0x70, + 0x6f, 0x2f, 0x6e, 0x6f, 0x64, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x73, + 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -1822,12 +1823,12 @@ var file_ceremony_proto_depIdxs = []int32{ 22, // 23: quilibrium.node.ceremony.pb.CeremonyLobbyJoin.public_key_signature_ed448:type_name -> quilibrium.node.keys.pb.Ed448Signature 7, // 24: quilibrium.node.ceremony.pb.CeremonyOpenState.joined_participants:type_name -> quilibrium.node.ceremony.pb.CeremonyLobbyJoin 21, // 25: quilibrium.node.ceremony.pb.CeremonyOpenState.preferred_participants:type_name -> quilibrium.node.keys.pb.Ed448PublicKey - 21, // 26: quilibrium.node.ceremony.pb.CeremonyInProgressState.active_participants:type_name -> quilibrium.node.keys.pb.Ed448PublicKey + 7, // 26: quilibrium.node.ceremony.pb.CeremonyInProgressState.active_participants:type_name -> quilibrium.node.ceremony.pb.CeremonyLobbyJoin 2, // 27: quilibrium.node.ceremony.pb.CeremonyInProgressState.latest_seen_prover_attestations:type_name -> quilibrium.node.ceremony.pb.CeremonySeenProverAttestation 3, // 28: quilibrium.node.ceremony.pb.CeremonyInProgressState.dropped_participant_attestations:type_name -> quilibrium.node.ceremony.pb.CeremonyDroppedProverAttestation 6, // 29: quilibrium.node.ceremony.pb.CeremonyInProgressState.transcript_round_advance_commits:type_name -> quilibrium.node.ceremony.pb.CeremonyAdvanceRound 21, // 30: quilibrium.node.ceremony.pb.CeremonyInProgressState.next_round_participants:type_name -> quilibrium.node.keys.pb.Ed448PublicKey - 21, // 31: quilibrium.node.ceremony.pb.CeremonyFinalizingState.active_participants:type_name -> quilibrium.node.keys.pb.Ed448PublicKey + 7, // 31: 
quilibrium.node.ceremony.pb.CeremonyFinalizingState.active_participants:type_name -> quilibrium.node.ceremony.pb.CeremonyLobbyJoin 2, // 32: quilibrium.node.ceremony.pb.CeremonyFinalizingState.latest_seen_prover_attestations:type_name -> quilibrium.node.ceremony.pb.CeremonySeenProverAttestation 3, // 33: quilibrium.node.ceremony.pb.CeremonyFinalizingState.dropped_participant_attestations:type_name -> quilibrium.node.ceremony.pb.CeremonyDroppedProverAttestation 5, // 34: quilibrium.node.ceremony.pb.CeremonyFinalizingState.commits:type_name -> quilibrium.node.ceremony.pb.CeremonyTranscriptCommit diff --git a/node/protobufs/ceremony.proto b/node/protobufs/ceremony.proto index b4ce8dd..3014ce9 100644 --- a/node/protobufs/ceremony.proto +++ b/node/protobufs/ceremony.proto @@ -105,7 +105,7 @@ message CeremonyOpenState { } message CeremonyInProgressState { - repeated quilibrium.node.keys.pb.Ed448PublicKey active_participants = 1; + repeated CeremonyLobbyJoin active_participants = 1; repeated CeremonySeenProverAttestation latest_seen_prover_attestations = 2; repeated CeremonyDroppedProverAttestation dropped_participant_attestations = 3; repeated CeremonyAdvanceRound transcript_round_advance_commits = 4; @@ -113,7 +113,7 @@ message CeremonyInProgressState { } message CeremonyFinalizingState { - repeated quilibrium.node.keys.pb.Ed448PublicKey active_participants = 1; + repeated CeremonyLobbyJoin active_participants = 1; repeated CeremonySeenProverAttestation latest_seen_prover_attestations = 2; repeated CeremonyDroppedProverAttestation dropped_participant_attestations = 3; repeated CeremonyTranscriptCommit commits = 4; diff --git a/node/protobufs/clock.go b/node/protobufs/clock.go index 1bf62d9..e1c5077 100644 --- a/node/protobufs/clock.go +++ b/node/protobufs/clock.go @@ -121,7 +121,9 @@ func (frame *ClockFrame) VerifyMasterClockFrame() error { return nil } -func (frame *ClockFrame) GetParentSelectorAndDistance() ( +func (frame *ClockFrame) GetParentSelectorAndDistance( + 
discriminator *big.Int, +) ( *big.Int, *big.Int, *big.Int, @@ -141,27 +143,20 @@ func (frame *ClockFrame) GetParentSelectorAndDistance() ( parentSelector := new(big.Int).SetBytes(frame.ParentSelector) - var pubkey []byte - ed448PublicKey := frame.GetPublicKeySignatureEd448() - if ed448PublicKey != nil { - pubkey = ed448PublicKey.PublicKey.KeyValue - } else { - return nil, nil, nil, errors.Wrap( - errors.New("no valid signature provided"), - "get parent selector and distance", + var distance *big.Int + if discriminator != nil { + l := new(big.Int).Mod( + new(big.Int).Sub(selector, discriminator), + ff.Modulus(), ) - } - - discriminator, err := poseidon.HashBytes(pubkey) - if err != nil { - return nil, nil, nil, errors.Wrap(err, "get parent selector and distance") - } - - l := new(big.Int).Mod(new(big.Int).Sub(selector, discriminator), ff.Modulus()) - r := new(big.Int).Mod(new(big.Int).Sub(discriminator, selector), ff.Modulus()) - distance := r - if l.Cmp(r) == -1 { - distance = l + r := new(big.Int).Mod( + new(big.Int).Sub(discriminator, selector), + ff.Modulus(), + ) + distance = r + if l.Cmp(r) == 1 { + distance = l + } } return parentSelector, distance, selector, nil diff --git a/node/retroactive_peers.json b/node/retroactive_peers.json new file mode 100644 index 0000000..2a8d5fe --- /dev/null +++ b/node/retroactive_peers.json @@ -0,0 +1,1052 @@ +{ + "rewards": [ + { "peer_id": "EiDt/I7irgZJvxHTKVYBWFC84aZt6t+jH44pTtBwDps2Mw==", "token_balance": 137558 }, + { "peer_id": "EiCFw9CwNODrkiOIVFcyLpwLDVbzw+gJEk9Up36FOcAkYA==", "token_balance": 137558 }, + { "peer_id": "EiA/zYamLhLM7WvEJw76qCsu5BECV7HuHnyXqijSDCj/LA==", "token_balance": 137558 }, + { "peer_id": "EiDN04yQGVtsc8h1kiuZ9/hsp0N8YnWmuD/H4sLqBo4cMQ==", "token_balance": 137558 }, + { "peer_id": "EiBEDZeMlawFcSNZbhMdpp81oeABrkoys9FW5Gpuo+Vx1w==", "token_balance": 137558 }, + { "peer_id": "EiCpNbD4duH1dBHPSKerruhytoJXS4yBzmxtll4/+uTdbw==", "token_balance": 157208 }, + { "peer_id": 
"EiDW4pG0FjdnCCn4Fl3WaFtsuOjewUPyYy1hgxYGcHJU4A==", "token_balance": 137558 }, + { "peer_id": "EiDjVqM33jPfGp3G6hmXGnY/xT9+jJQaNqzaEc5YPh7nrA==", "token_balance": 137558 }, + { "peer_id": "EiDi4THsckwtdtcsVftxOXE5ECGQVt/PlUR5z3CHfdaOvg==", "token_balance": 137558 }, + { "peer_id": "EiA3/AHA4LVWEJBJC7Vj3DwN96vPIFrq7sUMIriesskU+A==", "token_balance": 157208 }, + { "peer_id": "EiB40xAnxyscpEqR+HI4sqtEHX2L5TpOglTr8wgR1Rf8XQ==", "token_balance": 137558 }, + { "peer_id": "EiBDjyTCf4m7wrvkAq795q3/9GTROLo3dQRpJDbpkA7TlA==", "token_balance": 137558 }, + { "peer_id": "EiDnRZq4L5VNWDvT6lj3Lx8hV6uLISnIw5/WbWxPlHJ/YQ==", "token_balance": 137558 }, + { "peer_id": "EiDcwwi2Y/d29+Za4AKneG8WVGzn0B8IGasVAluMJX8kNQ==", "token_balance": 137558 }, + { "peer_id": "EiC/2ozdDeYRHkBa7eHblE0B7iHotFyuutG1OoxiCF/Iuw==", "token_balance": 137558 }, + { "peer_id": "EiCsl0NhFsQfmWeqmGFnAHJO3EvcInss6hwpzs4yCGbYcA==", "token_balance": 137558 }, + { "peer_id": "EiBacHPWZ9GDbROC3ir0iBFWLSRlYiJ+31IrNoJd8IVcew==", "token_balance": 137558 }, + { "peer_id": "EiBQ82JQDutS5d301d7ZwTebXyQaw8boOLxi3eKMxDjMiA==", "token_balance": 137558 }, + { "peer_id": "EiC79invPH4VCatNnehIsRFpI32LHVISg8T6gmBI03L+iw==", "token_balance": 157208 }, + { "peer_id": "EiBqHhBPaZ40hnQkisOf5i72uii3Ft99g/WHSZeABmN+gQ==", "token_balance": 137558 }, + { "peer_id": "EiDLg4I2+SAV7f0dUfT/qwDJWAstv1CAYmbhvJG3LxrgZw==", "token_balance": 157208 }, + { "peer_id": "EiCPSIISA21MOeMW7/jaoEm1X8S0JZtUy49uqxPi/FGbTg==", "token_balance": 137558 }, + { "peer_id": "EiC+fzSzlytoB6Ip0cRcDGmEk12Ogw58Sc3hNdsg/SHkOw==", "token_balance": 137558 }, + { "peer_id": "EiBziJ/ddhCWFhS0qAIyvBqdgFzVxeww8UfIHyR1LLFJJg==", "token_balance": 137558 }, + { "peer_id": "EiDSp9LSre4xNdlcUQUeWZ27Le3z2ux+yXivh+CQ9i+f4A==", "token_balance": 137558 }, + { "peer_id": "EiB7PV8KSHxzLWMdhtKyJetVAwG1MmFAw+Aq2scndWw5ag==", "token_balance": 137558 }, + { "peer_id": "EiCOMYZsW3yEELpS24n3pBVjAiztvHT5PgCPtX2Tlw1OFQ==", "token_balance": 137558 }, + { "peer_id": 
"EiB7RtBwX+XKHvZ60TAmYFwUc8wf/gjr/oxM85oRovHeYg==", "token_balance": 137558 }, + { "peer_id": "EiCGalaov/tWEJ9kefQb3tT2t8Iqhw8I77Hy9LTiqkTFCA==", "token_balance": 137558 }, + { "peer_id": "EiDVxRV2koJRehT22MRKhCUAwVjWoOs7jG3YHkH+H4n/Kg==", "token_balance": 137558 }, + { "peer_id": "EiDw3Ywh65HO9XDhjHrCh9O58F4by8QJYG/BIAj/0k/MCg==", "token_balance": 137558 }, + { "peer_id": "EiDegv3AFdSkzSBIMEa66/4zF4dAy9VWX+PfsgyWbrjMMw==", "token_balance": 137558 }, + { "peer_id": "EiBYvhRhcArCadflEwdD5dCjxnpBR+GTqbzTOeB6mftNBQ==", "token_balance": 137558 }, + { "peer_id": "EiAMbrjjLm8tnSBsGYC5aAB8W+bTgLLfZRZ/vNevcfe1UQ==", "token_balance": 137558 }, + { "peer_id": "EiBK+wXpiV7GnfCYqkUBCQm8+G0YxGKlKwAYIEYdNSPKqg==", "token_balance": 137558 }, + { "peer_id": "EiBh7HnX3m2LJoXl0cFrHYC0wRkeMIu86iXuD3ITfOcTQA==", "token_balance": 137558 }, + { "peer_id": "EiDyQ/H+unN8ABzy3jMO8AXqqRWMkdpp9S5Qo5KxsRk0yA==", "token_balance": 157208 }, + { "peer_id": "EiCtbAIknJCzX3zMa7XKdL2C/Vk5SOAuQvN1QQ7g/NINuA==", "token_balance": 137558 }, + { "peer_id": "EiBPYBYzDA1opc87Pxn1+0W3T1m7r5LLDGKXORuyqb/NrA==", "token_balance": 137558 }, + { "peer_id": "EiCFRWMLrI4Ep1tM/Ypek+bBdRjFNs8VSTYUOmvAmbY++Q==", "token_balance": 137558 }, + { "peer_id": "EiDq5lIy++a/uU3dGPbUo727N4pUfjY4l5aESU0ri91igg==", "token_balance": 137558 }, + { "peer_id": "EiCir42Ak2MRRlHduW0N6EYW/TvZ4iiaCjbrvip2k0YZeg==", "token_balance": 157208 }, + { "peer_id": "EiAMqyEN+tXoJQXA3/JSsaUSEDmAh9BL9ZOd1i9+r8uIzw==", "token_balance": 137558 }, + { "peer_id": "EiBIcjkr470meG42q/bwhhTsKvsWL/SjRjhRwYlWCzmgnQ==", "token_balance": 137558 }, + { "peer_id": "EiDACZWOuYJyI5Zp/z+P4rRshPO81fv9v7w/D0Vb9YOjkw==", "token_balance": 137558 }, + { "peer_id": "EiCqZLOl92jTRBmkq4aVHo08RWPBQuGuieg53X/gfbJX5Q==", "token_balance": 157208 }, + { "peer_id": "EiBb6Z0DMUQ7je1as86/l+cqGYx0lbI/j2ZOMf7Twr3qiQ==", "token_balance": 137558 }, + { "peer_id": "EiDiXM2XUbY95oH/DxoP7zIzTkiBTqIq2Q6oG4/Lzf3KnA==", "token_balance": 137558 }, + { "peer_id": 
"EiAhgQeJqx2PJwubnlXxw/VNRyoUQdSrFjTuGr3ZoNesPQ==", "token_balance": 137558 }, + { "peer_id": "EiAXJMczXLr3cPMAhaEcbgskqDNJR0AYc4h925HYASTp2g==", "token_balance": 137558 }, + { "peer_id": "EiDSsJ9Cz15k1adu13K8cnWpaBfYZQia5D4HNf19HpkMbg==", "token_balance": 137558 }, + { "peer_id": "EiBhpttrR0jqq1LcunsuvGf/UvmpkLE9K3XsmbsKpMmKaw==", "token_balance": 137558 }, + { "peer_id": "EiCIOXvLFLqVeaZ9cbNh+l0su0ZrLcQ+8fYSp8EDtzzpaw==", "token_balance": 137558 }, + { "peer_id": "EiB/w8e9nsOFtyA2SYPeeGqiMgwwPFivXPEscYNWzR+MPA==", "token_balance": 137558 }, + { "peer_id": "EiDiCoY7zYQxXDN0WuNX+gT0bTvpriicypSUX2NULZb1Yw==", "token_balance": 137558 }, + { "peer_id": "EiBri4toaKjBVT+c2ttG082uNPjN2YntOGUz74YUnA5S8g==", "token_balance": 137558 }, + { "peer_id": "EiCKW+WVd/yseDDV2jCI+y7yekbimA4EOjB4NpaQDUzAXw==", "token_balance": 137558 }, + { "peer_id": "EiD90pwUVRhsyBqleRB6+fCFlmyzPEuSShGUBa5TxIIfEw==", "token_balance": 137558 }, + { "peer_id": "EiCx5rf/JCPlaaSu2vfAdf+YpYNp6vAr+CoiV9J/dtfayQ==", "token_balance": 137558 }, + { "peer_id": "EiDOu8gLzNR5ZWmg8a6JjNaFQS6LUp+0JMjJDrrGlwfvlw==", "token_balance": 137558 }, + { "peer_id": "EiA6mZroS+TJ0PvEI3sszwFCfWOMfE8rxfTebdSCjpXB7w==", "token_balance": 137558 }, + { "peer_id": "EiCvx9ZgMW2yiOdLGLrFSm0O8M99wnuVAAwFJotqjpQTNA==", "token_balance": 137558 }, + { "peer_id": "EiBBNOiQksmnZ68ePdBabTngUR97UXxkJYXRPEl2agvdGw==", "token_balance": 137558 }, + { "peer_id": "EiAo3nz97JlE158qfzDRyzBAjpWBOEx/faUrVQePy+vxSg==", "token_balance": 137558 }, + { "peer_id": "EiBkXrxRgcgoZdI1KbsVhQAHDrmOWc55aspWx0MImrRcvA==", "token_balance": 157208 }, + { "peer_id": "EiCbTU+5tg7y6uBqHEISeCGu1R9v0CiIjXxcr4yHngKClw==", "token_balance": 137558 }, + { "peer_id": "EiBJXIAwAW2SRFAQrdhnglMRBhOp5m4SxYF2yJTTW9lF4g==", "token_balance": 137558 }, + { "peer_id": "EiDU0CsEo5ClYdBuO3JPoxMBRx8tEAam5cdYHaBAcPH8kg==", "token_balance": 137558 }, + { "peer_id": "EiBtaK2pOIVjGqj9/f7lTSljT39JWd41YbjLuNX0NWo3eQ==", "token_balance": 157208 }, + { "peer_id": 
"EiBQnAqhRGVsBfSz4+YdbzV05rHDKh++b1vSEcAXvKauoQ==", "token_balance": 137558 }, + { "peer_id": "EiATlGphiTRVCj4/CD1sRTqolV53dk4NA2kG22bIK9D/4Q==", "token_balance": 137558 }, + { "peer_id": "EiAXDYLdsQlcFCcBArHnVbhoq1E2YBQhGLsVhqaV2pjI1g==", "token_balance": 137558 }, + { "peer_id": "EiDiSU/zw55buN1xoWp+EnVEfunIINTN/pNkWiSFx4oA5g==", "token_balance": 137558 }, + { "peer_id": "EiB3i1+vGFtWlC8Ei/8AfpypQexK7qx6F/R5REdPL3NcKA==", "token_balance": 137558 }, + { "peer_id": "EiBHbU8R0nMCaNXqO2g0ewk+4vakrS1f8EL/VCytPqmTFA==", "token_balance": 137558 }, + { "peer_id": "EiDwwS45tB4GWVX11CcliPTQejbRmS+lcTJClEpQbquF2Q==", "token_balance": 137558 }, + { "peer_id": "EiD0CCDAOEx5kecW5b4ICbyg8BhhEhApmhgASK9Bqt8UDg==", "token_balance": 137558 }, + { "peer_id": "EiCZ60/rAs6kL3pE9Qnw3y/bv2GdUMnBBIXNN3VRqpYq2Q==", "token_balance": 137558 }, + { "peer_id": "EiBpvlWQu1uluUEyD266UVTyn22s+GNv9MSdKsLfsyt7Ag==", "token_balance": 137558 }, + { "peer_id": "EiDTu1XyDtM1N43pEjkMzj7ai/Q4X2XnEaFgz31D/7G73w==", "token_balance": 137558 }, + { "peer_id": "EiCM/tgnkDCqFGaULwGhlGmxsH3VKVIEOVHMd6RyTyEPSA==", "token_balance": 137558 }, + { "peer_id": "EiCcaGUxOHidGuLwj3QDnVR5BA3BelUb5GA6UOFWVAcY6g==", "token_balance": 137558 }, + { "peer_id": "EiBVaTZNWB4SCvlWOmwK6d0C5Yzl33hW8q7CAxjgb5zlww==", "token_balance": 137558 }, + { "peer_id": "EiDjNhYf9f1XFjPNjHqat5KFyQGzTJadpcpom20xt0F56A==", "token_balance": 137558 }, + { "peer_id": "EiBoMVXwviMRtKyjX9dtXnBUiErng+hbl4hLPa1LBsDQLA==", "token_balance": 137558 }, + { "peer_id": "EiBoIc1nPO5+W5RqtFwqCo7kxSDFdckdPVwxpkW6UyEiNw==", "token_balance": 137558 }, + { "peer_id": "EiBDwCF8pdxlB93eGtnRSo23g651J/aygpWQEtOnAYUbYA==", "token_balance": 137558 }, + { "peer_id": "EiD24ZVIuHTeXxVpGf6b1azMHq67iyj1LZNLwlFP6eZDrg==", "token_balance": 157208 }, + { "peer_id": "EiB8EGifgNwKMxlBc8o71qYg++hs1FLIXQEq08s2/69ktg==", "token_balance": 137558 }, + { "peer_id": "EiDCPV7yxYd7F9zfXvF/nl02PR3zMZkQkm4KAERVBCFk3Q==", "token_balance": 137558 }, + { "peer_id": 
"EiCp/4Ozp78dzwtKPvqucyDPmAWvKQI0Fc9rf1SX6/CARQ==", "token_balance": 137558 }, + { "peer_id": "EiBT8XJHEtogwc/6GEAkLesKHzbDonM068Mg/mCLYPfUlA==", "token_balance": 137558 }, + { "peer_id": "EiC39fG4ecRzuvl4qpqkSJ3eIyVDr/iEfLviwASR/YtfsQ==", "token_balance": 157208 }, + { "peer_id": "EiDj5Tv5rYO3GX5P/qp14nwaEbDMHtgMkIaDYTAPCnJOwg==", "token_balance": 137558 }, + { "peer_id": "EiBZtWRH2M3P/YGeVcGzeH1WyPAT1EAufTd6yX1bRSR+vA==", "token_balance": 137558 }, + { "peer_id": "EiCsWqPPPT+BhctANRqQHFD+Dj/GdKsTmFHj6Xq84yg3XQ==", "token_balance": 137558 }, + { "peer_id": "EiDOauPSTm/mG6Elcd+EBBlpMXamMY4eCFXkRCGciKE6KQ==", "token_balance": 137558 }, + { "peer_id": "EiBw0l6b5gwe0O0Qr8WnIMx8a2Y2L5Vjziq74x/WfRg1HA==", "token_balance": 137558 }, + { "peer_id": "EiDKtgwh/YwV5hLImqNffl6K2cVyMYKbt243FB6YV4Re5w==", "token_balance": 137558 }, + { "peer_id": "EiBe1BsrDW/oLARa5LVx4d4o7ktYfxf/yoS+CEW1C4PcyQ==", "token_balance": 137558 }, + { "peer_id": "EiCVNzMEqHzb9TsgWV7AlzdT5lFAtVbA0b3g2VbZj7pUWA==", "token_balance": 157208 }, + { "peer_id": "EiBU1uPpjaSSbD+hqspV0xg9ms/a/u1SdZQPKPUwwp95WQ==", "token_balance": 157208 }, + { "peer_id": "EiBcAUabPqHI+YSNuzRpTFm5PxDeRVk3vb/4HYlLVkbxnw==", "token_balance": 137558 }, + { "peer_id": "EiCkMEu6lXBun9op8QbDcJJqwg+y531zJCdBcpKzyyzBVg==", "token_balance": 137558 }, + { "peer_id": "EiC57WWhYCOd1+qqmxW9XnjkhG82xuBDv/hZFT4wbJAbUA==", "token_balance": 157208 }, + { "peer_id": "EiAjhP9B5faB5+2IZI2cSm+FPfw9pfB7SL7AYDGlK4h4AQ==", "token_balance": 137558 }, + { "peer_id": "EiCOGSBY4ly4j3GZ9yCw1vVyr7rd5S9yWjViKuLuzTETsQ==", "token_balance": 137558 }, + { "peer_id": "EiAIIeJwouU3PVWLvUhPtm+MiQntIEKnpaKlsyIMzdgSnQ==", "token_balance": 137558 }, + { "peer_id": "EiBo1PqBDIQRzE3hNL0DttKpjuaj2yWG2tPL01UKY+31LA==", "token_balance": 137558 }, + { "peer_id": "EiD65Eh0LKGj6AgMlsVmEBGnpisJgTcdilvmlqAqBCVBTA==", "token_balance": 137558 }, + { "peer_id": "EiDxhfqRg6RzeaZrJl9FbDrEWc0Aq1iph+xWBamS6FcgTw==", "token_balance": 137558 }, + { "peer_id": 
"EiDrYab6PMYmhZ4e3ry0Qw/+3sbvVfG6M2vJfrUsZcn4hg==", "token_balance": 137558 }, + { "peer_id": "EiDeAUv0KvqElzgVULgPe7J3bZFV1Vi0qNREG7uCR4QOrA==", "token_balance": 137558 }, + { "peer_id": "EiBBqBROgbME6/f9OmDnI+p6gr/XfRWdhCVbaXWIe/It4g==", "token_balance": 137558 }, + { "peer_id": "EiByqJCfrbGJK6SAZJkdaM5LCIk1jsegmD9vQhscSWtFhw==", "token_balance": 137558 }, + { "peer_id": "EiCqM/Pjb5beBN9WYrlsg7jOYpT6qxLxIeflKY3dXXJRUw==", "token_balance": 137558 }, + { "peer_id": "EiDZopN+RnHBtk2ocwTFPuNe7JWxw22Q24q/vfvkcSW7KQ==", "token_balance": 157208 }, + { "peer_id": "EiAWko6omZov8Q02glLaXuFTcYcP2Em7cJ2EUdawyq1ZKQ==", "token_balance": 137558 }, + { "peer_id": "EiC79QU6FfCx3fXWTypurSbaa7MHNawbXlsRGRwD9kIuvw==", "token_balance": 137558 }, + { "peer_id": "EiBPAuih1CryAgv7qNreo/NH6lrJWAT8QYwt71XGMnprkQ==", "token_balance": 137558 }, + { "peer_id": "EiCNsbzzelVdxxhlJSvkwjt7xbZi40tpcojNJAaJlcxWTA==", "token_balance": 137558 }, + { "peer_id": "EiC842UyoMHux38m67Ij7jJz4aL7Mg1IB8CCZd2IF9ZlGA==", "token_balance": 137558 }, + { "peer_id": "EiB5TBHNSRvK9MwnbtW85pM9eaJGBULDivUFlCQauLUcig==", "token_balance": 137558 }, + { "peer_id": "EiB8UECS82ItUDV1t7C0UN4WyNu6nVBfdaM37lzwnXDcJg==", "token_balance": 137558 }, + { "peer_id": "EiDhS3+6Vc3SaMdJpYtrX5fnRS5ZRDEfF8vx+iOwWIO6Bw==", "token_balance": 137558 }, + { "peer_id": "EiAjlO4GFVrt2V+pQikLXmHfBmqcvTmJOQD3+eJXnOJl+w==", "token_balance": 137558 }, + { "peer_id": "EiCNte5+wR+8p342FHVoaR2OGq6Vb3NnXW2AhG4W+dFqLg==", "token_balance": 137558 }, + { "peer_id": "EiBjJ7ZhiNV9aHcjEtTyLANsX2lsrRbE7DUJvn5rHpW+vw==", "token_balance": 137558 }, + { "peer_id": "EiAcdEkxOVC3LS7goYpEO5du4LttGgg9NVM35jurSs99qw==", "token_balance": 137558 }, + { "peer_id": "EiDjXPG7MmNt/AKYQDDziB6rkUVFAQBPEL8rd2fhlnG08A==", "token_balance": 137558 }, + { "peer_id": "EiDrhsEnPMADvxpuN3kj1n18kvvWKXA3oWDHi1WZXFgnvw==", "token_balance": 137558 }, + { "peer_id": "EiCnI2JWuAmhP313X0ywIwgK0BugQ7NgA6ttp7PbBjPB3w==", "token_balance": 137558 }, + { "peer_id": 
"EiDa3uT5jYEMOq55A5jQStBwijgR0KF2GSifgZbGGRTZiA==", "token_balance": 157208 }, + { "peer_id": "EiCVUzWKGlK815nhPRldaii2fU72oiDebdMtOHC1NjxVBQ==", "token_balance": 137558 }, + { "peer_id": "EiAZ46m+gjk1acOEZWaYBE3Mzt/r2U9CKm9fZzkjwuP6pA==", "token_balance": 137558 }, + { "peer_id": "EiBNf+29/yg2TqUgcm5mVAKq2Awd4cIE0JXnehyh8GUVhA==", "token_balance": 137558 }, + { "peer_id": "EiD9ZRy93btwIV4ucAefUMYpuliAjG7dhP/8XV4jTOIgxQ==", "token_balance": 137558 }, + { "peer_id": "EiAk7UZ2QrfW3w7fW3aOnQGTeIx4BkodlcVoqoDj79dCUA==", "token_balance": 137558 }, + { "peer_id": "EiCWy8qgkdwORzG0b0o7x3sFHGMzGR29ZeCM0uUUqCJ3eQ==", "token_balance": 137558 }, + { "peer_id": "EiDz3WxEff1Sa8ByeiwpvFvIX7Nf9mBIGlSmzjW4yVtZPw==", "token_balance": 137558 }, + { "peer_id": "EiBbt7dZ0/SdrpyHEGRAVCNV5tpZOf8iGkf2hxGjGdgIWg==", "token_balance": 137558 }, + { "peer_id": "EiDOQHb6qPJjw4S9iqfiTbRyCpANCGzJ1wjk+DCMbZCqRA==", "token_balance": 137558 }, + { "peer_id": "EiB6VG4YGT/FOpsMlw70GtuxLwRWLtm82on6h8Tu0fZtNA==", "token_balance": 137558 }, + { "peer_id": "EiAk0lqV/eHg9YPtN4ChUQoPctuHRLtfi8X5i8cyP8aYCA==", "token_balance": 137558 }, + { "peer_id": "EiAkLM53kaSeuM8qcBRnWIBWtZbMtoYeTHrr2gOLQA7Y0Q==", "token_balance": 157208 }, + { "peer_id": "EiCA3PAFkKLgOXOPTOPWUvHRtDo4kqaP2A++jWq4FVnEeA==", "token_balance": 137558 }, + { "peer_id": "EiCb9Jvw2ydm4pzukNeYd7uQADWMG06yblmV+DBPlvvTGw==", "token_balance": 137558 }, + { "peer_id": "EiAD/2QUvbHyV9Z/I2YUFwGccdA7tyLJL3gN+78xoHvcJA==", "token_balance": 137558 }, + { "peer_id": "EiCZaMGDkCoAEyJgRozTxZtEkjmJmJK+7yGyiseoeFDsrQ==", "token_balance": 137558 }, + { "peer_id": "EiAr7ITg8LupDC6Ofj1F42es67j4IHuGDLXAbHBkcpX9iA==", "token_balance": 137558 }, + { "peer_id": "EiAuXy5mmGl5HjFoya9gddTfBQI9ltryDq5Jo8kTKrGmnA==", "token_balance": 157208 }, + { "peer_id": "EiA3D9eFBoUeMCLKvEiYi7mYE3pbmgedRj7MAdRWHC6iXA==", "token_balance": 137558 }, + { "peer_id": "EiCCwj7cKZuz7wtrOiw6GhNGg7j/pm1oupCJ5oq/8BR5Tg==", "token_balance": 137558 }, + { "peer_id": 
"EiDu8Y42LEgOfvdNSuX6le505kdL5UPQwfvXrndiVwUCXA==", "token_balance": 137558 }, + { "peer_id": "EiC/fh676OHCz8Vup8mvGJ2BRQuV4w/M2mPQVrjUy9418w==", "token_balance": 137558 }, + { "peer_id": "EiDisvDDXH6qzTtoiBfEo1UAQ/no5de1lOs5mbMvNuzGyw==", "token_balance": 137558 }, + { "peer_id": "EiDftaeCgl6Fl6VuRt7kH03+p83iY4fLCvVBO0awZbSkTg==", "token_balance": 137558 }, + { "peer_id": "EiCskqQddnjW6dsG0EkpjDRQtB0T/2i0HmF95FXq/oc/fw==", "token_balance": 137558 }, + { "peer_id": "EiDfYvHCkMwHQo1SGIrExl+xFo0Rk7WVyLzb3R6Jtdye4w==", "token_balance": 137558 }, + { "peer_id": "EiDPjISeAgCnpggVNcAFeTWP5T69QWbGZr9RK2D6SRoX6A==", "token_balance": 137558 }, + { "peer_id": "EiDjEtnQ3bj7NzDbaS1frrVfbGrjRNs4nXelBRgpdvtbhA==", "token_balance": 137558 }, + { "peer_id": "EiBmrBWcyAr89jZe8YB+hDvj/jl2ozI5fcN+UZU20RYmNA==", "token_balance": 137558 }, + { "peer_id": "EiAQjU11AvNccFg+G2P1xiXo3eAwdjKHDE+njKV+Gplo9A==", "token_balance": 137558 }, + { "peer_id": "EiCH1eO9ezx9Q5ICnHu0qUkWljIXs5sDKIgJ9toSbrzicw==", "token_balance": 137558 }, + { "peer_id": "EiBvj8iR5QnqtyCLMlgwLp17LRkCnPvjggfb+/8b00sQTg==", "token_balance": 137558 }, + { "peer_id": "EiCPkDUg+eAEAx4eMdctzsBJ2W6Q8KkCPz5Ed2WkCMDymg==", "token_balance": 137558 }, + { "peer_id": "EiD433yqmHk/ISwbVapwg4RFNvfOaw9OWtJTzHcB3JbmdQ==", "token_balance": 137558 }, + { "peer_id": "EiCUU9k9AvybFsitkfFweis1z8XdzfjSnFI7VwHxS4+K3w==", "token_balance": 137558 }, + { "peer_id": "EiCgvtp8KntElYqHPcMOqnNGaMRSMQrhiA75JzfpEjI5Kg==", "token_balance": 137558 }, + { "peer_id": "EiAkYt7QV7Fj37K5uPaaFzqSvm1Y6vQpHtYVkX2TXm6vZA==", "token_balance": 137558 }, + { "peer_id": "EiBqkYtumWWv0YqAikqLyskPL2XGvz984JUJao76py2Ftw==", "token_balance": 137558 }, + { "peer_id": "EiB48ZoOAYpAPbh+y1R/NV2KEStNmMtfluDCX9GEqLvrhg==", "token_balance": 137558 }, + { "peer_id": "EiB/LQgpH8BXKxXeT/1I04xa7BSK3Rpqf4gV7mgwecDV6g==", "token_balance": 137558 }, + { "peer_id": "EiBLTzHUsmdyJSU0K6IJ+3nD7deb3j/w4YBe2d9WNq5REA==", "token_balance": 157208 }, + { "peer_id": 
"EiDVngikPhoWPFGQDaukzbPry2+VnuJMNl+4qtNH7qLhQg==", "token_balance": 157208 }, + { "peer_id": "EiAeUgH7prW7wVAXhoMVNqfvN3epBJ75X0dftHiYx8/+kQ==", "token_balance": 137558 }, + { "peer_id": "EiDqQJ9ndp28N7MtMynRou6vRds9J30evUiR2sbzOSQwEA==", "token_balance": 137558 }, + { "peer_id": "EiBFJlOHZZIVEEOUYzN+7m9z8Dyi2JAJ7eMKzXhyxOxGwA==", "token_balance": 137558 }, + { "peer_id": "EiCgtRXQ+69xr3xSexZjmmBo9as5fmdXfeMAmee0LSRhHA==", "token_balance": 137558 }, + { "peer_id": "EiB+SDiUu5zYWg0XrjFABF+v72aDGylM/Xbvh99bLgC/rA==", "token_balance": 137558 }, + { "peer_id": "EiBra5zVAnmYZYfzXu6lB2/gS6BouXV7DcIPkA4qNdxIEA==", "token_balance": 137558 }, + { "peer_id": "EiDaGoc7jYl3OEYhffHVFzIa7lh2T/rwUSM8Lo8wt0wLIg==", "token_balance": 157208 }, + { "peer_id": "EiCc1y5qnoyYev03GMklk9mqPI4vzkRZA8NImSTq3pbHDQ==", "token_balance": 137558 }, + { "peer_id": "EiCreC2KQ/VUXAkh7B7vl/b+amIxVAK8agO4AakNwRhFqg==", "token_balance": 137558 }, + { "peer_id": "EiA+KiRdIaZ0l8JnM7p7b4ixAXsgnetDVV0b92i/0mKkfQ==", "token_balance": 157208 }, + { "peer_id": "EiCntm/fnimBfZVLzMo5FWyCLAecPdfBUjxsNA1ywux6uw==", "token_balance": 137558 }, + { "peer_id": "EiCpGHsztbHwtuhNQwCLjJ3A5mOuUBUSjZz0pOticztOSw==", "token_balance": 137558 }, + { "peer_id": "EiBpztNN0Xqf1t38b2EbhwdiV0uPnh7JouqPAQHf9k4eTA==", "token_balance": 137558 }, + { "peer_id": "EiDNgDbbc8h7d168za3+7OkxsPWfbF2/Wj0C0TqQ6wso8A==", "token_balance": 137558 }, + { "peer_id": "EiD2ODrPmC9doWMq/WCiymEsdeUBSHVdaQahQn8X1JSKGQ==", "token_balance": 137558 }, + { "peer_id": "EiCW3hg2/wfrbRsqXVAlRmNz6yhLJ/euBp5XT1WK0+RBTg==", "token_balance": 137558 }, + { "peer_id": "EiDcHmZCqlve6TljT1uNA4sf9nDkwoitbODaczsrrwDuyQ==", "token_balance": 137558 }, + { "peer_id": "EiCtKApm4Z9at1keNK+D9G+qtcZjcmOYorFsIdgBP4jxMQ==", "token_balance": 137558 }, + { "peer_id": "EiD3E5ybLfrVuIlTNC/PtLT0bxbJ73IkSm6V31WD7f/CgQ==", "token_balance": 137558 }, + { "peer_id": "EiABQZ4CCgA/dROFPMznV8OHlYJwvB7YVpE2QG8q/lYz4w==", "token_balance": 137558 }, + { "peer_id": 
"EiAv0sXFIFO95FO1i6pcm3Cv/etwK3hgnVNtAjIirIi79Q==", "token_balance": 137558 }, + { "peer_id": "EiA9CUVhMXrDwoey5zZ3UDk/9vv5NmVRSrI2E1VbfbwChQ==", "token_balance": 137558 }, + { "peer_id": "EiBF4OtXBF62jVOAd9/aGSmi4yMqJGgiGdlWuHoXXeQbgw==", "token_balance": 137558 }, + { "peer_id": "EiBVjczAYfWR+Cnr7Yr7wAWYVNDRvQB8MK7kimQxvddy9Q==", "token_balance": 157208 }, + { "peer_id": "EiCnqObOi5JPBt+wlcrfrtTOLvtzXcTH+bz8Ho9c0kBPlw==", "token_balance": 137558 }, + { "peer_id": "EiBf+/8AtcguK8Jg7G8uhSk2RrE4zeLyHtKvOq5ivGVG+g==", "token_balance": 137558 }, + { "peer_id": "EiCvbY06avzEq6Z5Fl9dVGpSYxgxvGXYBt2t/pYB5MeJUw==", "token_balance": 137558 }, + { "peer_id": "EiB5Anrm1+fR82PvpuxxA90ZxVjYytw1VNO2YofrhgHJ1w==", "token_balance": 137558 }, + { "peer_id": "EiDDxh5G0ceG81L8ZM5OImUHZCa5FRlBJQ517wpVK2d5pQ==", "token_balance": 157208 }, + { "peer_id": "EiB+lNO1WleKyaoqwVjpkYSBuq5YglwYMtokLaIJool4Mw==", "token_balance": 157208 }, + { "peer_id": "EiAqIczZ3El5VDLORlRmlL7P2PDJymLTOYbqVLVbOCxo+Q==", "token_balance": 137558 }, + { "peer_id": "EiABSmVVj6o/k35ZhWz+XngR//XA/WNXt9dnsOwuWqJomw==", "token_balance": 137558 }, + { "peer_id": "EiBC2kF9sAvD2S2/j95S1equyXP1djna0trNfv+/i7IWMg==", "token_balance": 137558 }, + { "peer_id": "EiB7pQTMuJPWqN4Ov9R4fA01GPo76W7U8wJnvyQ9mMwXyw==", "token_balance": 137558 }, + { "peer_id": "EiD7mDCGtv4wQl7lc9SKxdXWDGnhtCCG4xlnS/eHB6ZO5g==", "token_balance": 137558 }, + { "peer_id": "EiBK42dQx6Omt1qfftHiTirf/LjYcehnhshoLBNRxaNZ1Q==", "token_balance": 137558 }, + { "peer_id": "EiDI9PlqCQ44ItfmAv8res4WcjFBy/qoaLVUz3GgJ0l6tQ==", "token_balance": 137558 }, + { "peer_id": "EiDr82Ty9BU5Des3YkZWk+7yO0h6LwJKXzfkHbUipidHbw==", "token_balance": 137558 }, + { "peer_id": "EiCB0vEE0C3b88qllK92qAlVxjvAU6TZOUsVs0/q6O4BhQ==", "token_balance": 137558 }, + { "peer_id": "EiDeTDldMcJfdupjyie3xCfD6nqokoXvQWtrn6hpqY0x+g==", "token_balance": 137558 }, + { "peer_id": "EiB6e4Y3e7dHwg8qZkpt9uZ02Hopn3kNUkyxRcjFXabhpg==", "token_balance": 157208 }, + { "peer_id": 
"EiAN2SYlXsybAQ9XkdLJWBIYN3rPJJVyBODvA30r0crMgA==", "token_balance": 137558 }, + { "peer_id": "EiBtqGRGMEzBNhbkLDFeBqD0mtxI58LBRNV3wus3bKD18w==", "token_balance": 157208 }, + { "peer_id": "EiCQPlVMmVo2eTY+iZzPtKPtpSFpCkUrODV5bFDNoXu4Yg==", "token_balance": 137558 }, + { "peer_id": "EiBIKOtXIlcuDEBueu1XrNyswYrzSf1ujX8Kg5NWLZzc/g==", "token_balance": 137558 }, + { "peer_id": "EiAA7wfcxZux/SjXo2voSj09na2CREo1UGsWkf15UGZTDA==", "token_balance": 157208 }, + { "peer_id": "EiA4VMDy4uv9XWIgAi6ZhJDM35uaEuTwS3e/VZwecfKmKw==", "token_balance": 137558 }, + { "peer_id": "EiBN6EJOM2NQG4JSB17CIDX4Q1wer6oGycjXt7E2YzqfOQ==", "token_balance": 137558 }, + { "peer_id": "EiCGGLalG9WE6bozbLdGaK5guXTfZ23RBlGs9Uv9uPQVLQ==", "token_balance": 137558 }, + { "peer_id": "EiAvZSddPIdV7D3ljwhRZ+5b2OxVlC0Y8fa4zkILgPDSLA==", "token_balance": 137558 }, + { "peer_id": "EiAtFvzYP8Q3yVI6s0NMyUkjybEmDobuF//x+A3f5BJBmA==", "token_balance": 137558 }, + { "peer_id": "EiAZSsgD67QhAU3SE/6poRBrO5zPOR2VGOTxMM84TY/VDw==", "token_balance": 137558 }, + { "peer_id": "EiAgpQDCV3cnKbhkv16nt/pF8AxH4Zi9KLpkbU3L1j9G9A==", "token_balance": 137558 }, + { "peer_id": "EiCLn0btOlih5XI+OepuUGTh/cfAJ33w5ynGlpp83zyW4w==", "token_balance": 137558 }, + { "peer_id": "EiCDeai2KHQs6kQbLH89DmKkJGNB+bqz5Sf+8HMFRrtRnA==", "token_balance": 137558 }, + { "peer_id": "EiAu5k+odJOG2jRk2ikvsKBxdk1i2kSwXMUhmDCnipmc4A==", "token_balance": 137558 }, + { "peer_id": "EiCs4jsoS8acBYI/EEI3EWkF/LT++aAmJxwGL34NJ8fGaA==", "token_balance": 137558 }, + { "peer_id": "EiBI+4rfaF+ZqC40CLlZ+Rph6EBFLTtmipFLBf+0HIrurA==", "token_balance": 137558 }, + { "peer_id": "EiC5AOkAl2pVyR9LstzYM/NmYWgVfnIs3Icxz+mPmEPG4A==", "token_balance": 137558 }, + { "peer_id": "EiD/R1kMZFs/Uphk6hAObL8T+5tdW+bCUgZ9NVSdHcACyw==", "token_balance": 137558 }, + { "peer_id": "EiCuopZao+OzB/jbBkx4kBygbkbmdDfwGToQ9efHfvhFbA==", "token_balance": 137558 }, + { "peer_id": "EiC50zJitcCJXzRJvHvps2z6JPRhSDFmhibISk7ttGFoTQ==", "token_balance": 137558 }, + { "peer_id": 
"EiAT63M6Px7ZxHiYxiv7ZxZd42brelsauhHsRBt9Cax/uQ==", "token_balance": 137558 }, + { "peer_id": "EiCkrMhJuCmV2zhJIGGPciNuGc2lC6U4L0RxnbA5By4D+Q==", "token_balance": 137558 }, + { "peer_id": "EiB+GN7cVgllTzW8oYTbjHDYxV2p/OeuFkpCwaHZFUIlEA==", "token_balance": 137558 }, + { "peer_id": "EiBLrG3wRVZM++De9v3+ccZzDgMJ6HEAplPh61B4S5YY6w==", "token_balance": 137558 }, + { "peer_id": "EiBqwyHIPiwSaWxXdHRzK2iGUm4okH1Hv1KUjshZTmEZoQ==", "token_balance": 157208 }, + { "peer_id": "EiA0KQCUtI0PEsiNUZGyW+AaJouiK4FI9+FrLtQJxE+aDw==", "token_balance": 137558 }, + { "peer_id": "EiC4tbGUqzjdaz+IxkebyB3R3Pphzup4FvYFsxv4b3i+DQ==", "token_balance": 137558 }, + { "peer_id": "EiCGUQXfBRuokJRXVjxE5MTsmFW1PwJztBOqZ70C3PURdg==", "token_balance": 137558 }, + { "peer_id": "EiDe8g43WqEUHuI/FS1MOjIaBrvLDjiufR83YbwszrVqig==", "token_balance": 137558 }, + { "peer_id": "EiDPGdvmHON/A0ldwFf+R7z8PF49SLqEiAhyTfgqqTwDIw==", "token_balance": 157208 }, + { "peer_id": "EiDcMK4WU6KBZL4zXYvYOMz70aEWGY/npzh+LDzGlP+BKw==", "token_balance": 157208 }, + { "peer_id": "EiB8x4GuPKiBHbL5du5wlBYC5C5oppLajOjIfmLQJ6OTcA==", "token_balance": 137558 }, + { "peer_id": "EiB9s3cB6R2ef6szn78Buh39wNJTT6Zlzr8tOYyizQEGxg==", "token_balance": 157208 }, + { "peer_id": "EiDuLsv0RyrofD3QyQpsR5+1AWqGoxcSDWthqhmR4KlHUQ==", "token_balance": 137558 }, + { "peer_id": "EiAoWM0By5swyHg7oo8rX/+1dCoL4eUloJxAtl1YftCqrQ==", "token_balance": 137558 }, + { "peer_id": "EiBbNKpta8Qsn19wiIWnKTD6Z3Zq8NdZe7yW1rEyXdQwOg==", "token_balance": 137558 }, + { "peer_id": "EiBVxhk+ShOhyzwlS1dzxiBqvmh8zTfU7TBPa6H41Xu1Pg==", "token_balance": 137558 }, + { "peer_id": "EiCRoYv4VQBN7iwhi/yblGxEkTb163UCdcwbwS/VWEjfAg==", "token_balance": 137558 }, + { "peer_id": "EiCZ+OJHLfeVkclYarjJ6U9vVRq2yulDqBK7aPZEY5+oyQ==", "token_balance": 137558 }, + { "peer_id": "EiDhgDJHnu/4e+AIgtohdNveAD9vQN7bRAm89DSqSzP6sw==", "token_balance": 137558 }, + { "peer_id": "EiB9gEMg1+aoU0glO9ykSOOUPfCXNqt1qYUo9SOymO3bAg==", "token_balance": 137558 }, + { "peer_id": 
"EiA100t3PRxiPpxt1rKgVxYO5DLh7BAnZkkeLvveEXuT3Q==", "token_balance": 137558 }, + { "peer_id": "EiB97HDI8LQC0GV6Duon7y2OxJ6sYMhVrGFCwfmLLM3eRw==", "token_balance": 137558 }, + { "peer_id": "EiB3RYnKKJqN8plhodTS744tIZV42fQI2PDR3qi7pVXwoQ==", "token_balance": 137558 }, + { "peer_id": "EiDQAtKJ5IOwyWyJcLfMgQZUXagbv5T5bWJROotr7yhngw==", "token_balance": 137558 }, + { "peer_id": "EiDbQABcKvZHuMaoBOSILSvZ+WT1+HXz9uuh3iB4iCRmFg==", "token_balance": 137558 }, + { "peer_id": "EiDgA09flDCpYk2RZJLhkPNmYIaDPxFC9bc6HFK8t4y8nQ==", "token_balance": 137558 }, + { "peer_id": "EiBDoMkiC6QClOVuQbgQ8fNgF4+9UoYJEu/ofbY+Q2FZJA==", "token_balance": 137558 }, + { "peer_id": "EiDh6xqGmTtqaAcT8fUy42+9uOwSyuaiWu2H4BCemCT5Ig==", "token_balance": 157208 }, + { "peer_id": "EiAAY6pE5i4Va9G24V1wf3xlWAlff4lmAgGLxx5YRMDm+w==", "token_balance": 157208 }, + { "peer_id": "EiAUHyDXncXwDNwXam488KI/ZcHcEeQ3Y1UepIUVa0jZpw==", "token_balance": 137558 }, + { "peer_id": "EiDpDm6ULT0zFLyFt9FoEyDST0UUWENG/tvWVcfrPEEykA==", "token_balance": 137558 }, + { "peer_id": "EiDAgucJmDeH71VTlBr6Tds9Xlgp4wfCXYtPiEZ6EQl54w==", "token_balance": 137558 }, + { "peer_id": "EiCZW+B1BC+/XIBHqPp+TlQetFZtTONViTENNKX9gym+pw==", "token_balance": 137558 }, + { "peer_id": "EiB7jT12NBT0LAhuQgTe9W2ZbZPjh1r7zTw3gqeILXdW8A==", "token_balance": 137558 }, + { "peer_id": "EiAnaYLNqkkD+KsaiEmb3jjMxV5d3SwXZLRWZRWtyaolFA==", "token_balance": 137558 }, + { "peer_id": "EiAUXQsFmfuuGg2K6YXc/PIOcGoV7easZ4FLCZ2sLPubaQ==", "token_balance": 137558 }, + { "peer_id": "EiCKPjtVwy95EHMWusmjDKbuM7y5c2hQMARWuA2lWvozqw==", "token_balance": 137558 }, + { "peer_id": "EiAC36AjoCsacpU3+EDSsSmwn0k+h1kCRU0xDqUKDK1PBg==", "token_balance": 137558 }, + { "peer_id": "EiA9FLFmQFfiVgv42zPf00A1hiwsQ6dP3Ff4VYKhefPLPA==", "token_balance": 157208 }, + { "peer_id": "EiAAGmFUPKTUM+oerZ9OBng/gG5zKv7bNRSjmv/AC+evQA==", "token_balance": 137558 }, + { "peer_id": "EiAN5DlwxKeEIjn5coZTEUi4N464ny+WPYOWu6dzlTPS+g==", "token_balance": 137558 }, + { "peer_id": 
"EiCgtlpVM12OZRU3x1VCjvjsWNkKdjer5Xfy0VkMiZOhvQ==", "token_balance": 157208 }, + { "peer_id": "EiBgwEn7JglAV+D3hYPqQcmipxopjPiqc/nAzmYCngdQHw==", "token_balance": 137558 }, + { "peer_id": "EiAfLa3D9ekAB6XZ1E/P71iTJITjOJs3EKRNDBYtDiexmg==", "token_balance": 137558 }, + { "peer_id": "EiAgLM2GAnbCvpsZKlRfJwJ0eEKl3n4DPsIxE4jvX92l/Q==", "token_balance": 137558 }, + { "peer_id": "EiBZGTYzIzVy5pBAVaSbY9cGqw5xKfUvP0BQDZs5BR09mA==", "token_balance": 137558 }, + { "peer_id": "EiAqO53EIRsj17SqXZWWJECTyUaM6cOf3gW9PSEJKoQbZA==", "token_balance": 137558 }, + { "peer_id": "EiD706mU4PTftS7516daxMQoMj53ixhFrC83jtM62AK9aQ==", "token_balance": 137558 }, + { "peer_id": "EiDWgzeIvL6EEZM/EwjD0Q54TLd2+m2yZpIehC/tsJKOHg==", "token_balance": 137558 }, + { "peer_id": "EiAEogUfqBLPI1O3mUr/b/0AVNZLm7DuM1NT2RA+h4CS7w==", "token_balance": 137558 }, + { "peer_id": "EiCEg/y+xLhzoOykP3oen+G7aFEhcLgm5YNyiIrSlxEFlw==", "token_balance": 137558 }, + { "peer_id": "EiD1ClZn/lr+n/gnS96Q4gKwBDk3yl33kNIhx9wUxJiyEA==", "token_balance": 157208 }, + { "peer_id": "EiCS+UGbICfohPwlkkBArU+suk5ocOvPhGS5rDBavrSysw==", "token_balance": 137558 }, + { "peer_id": "EiBejpof3a17bbGGegjU85qeUIXYefAlXHLPAo1FIuQTYg==", "token_balance": 157208 }, + { "peer_id": "EiAGEpNUss0oz/eSsRLBtcNtd8TyiMVfr7xy48pIG8VawA==", "token_balance": 137558 }, + { "peer_id": "EiDyVGg9sAlZvuJh6sJn38vCxfReQGoSKupyZQT+PxZrBg==", "token_balance": 157208 }, + { "peer_id": "EiAdEYEQJQqGEmJmqnodIqypxuiRKOpULxCzOGk/Drc1Jw==", "token_balance": 137558 }, + { "peer_id": "EiD9Llqctpvwv0RRiL/zTbjoHZ9UxRaxw+rrZe1As8V61Q==", "token_balance": 137558 }, + { "peer_id": "EiA/aIPYU7aZvO8ZaEOf6aWuAo724Zh4wCoH0d0N0Pnn3w==", "token_balance": 157208 }, + { "peer_id": "EiBksN5pbnr4zLiMWshG5SrxhqxJUMBkZW5qvkpDauAq3Q==", "token_balance": 137558 }, + { "peer_id": "EiBE3N94casP7FFAAol1HDV++Nah7QCNKHHjZtRuk+z4QQ==", "token_balance": 137558 }, + { "peer_id": "EiBjrd5gSlAfML4iUDdknXTWrwZ9lfGViPSv7vIjyys99A==", "token_balance": 137558 }, + { "peer_id": 
"EiBLAMpUbItxRg+Q5FvtlvaGi2rYiwjtKzXe1DmqKUi6/g==", "token_balance": 137558 }, + { "peer_id": "EiApVYQygAgEh2IFOXEHou8avd8Qw3FjlwsYqbPVUhK0fg==", "token_balance": 137558 }, + { "peer_id": "EiCD+wJGK3BpFfByuU70A+iO/MhWoZZqkdxIkZqVtzq5Ag==", "token_balance": 157208 }, + { "peer_id": "EiDYS4Kmy1C3HrT/t5UcxpJhY9s274z/+QXmW+rh4qu4yg==", "token_balance": 157208 }, + { "peer_id": "EiBPrs9PE47ZceqK5Fp+jtpInHjfbvpuasWUDKgWx5JTLQ==", "token_balance": 137558 }, + { "peer_id": "EiDq+NOd3OhtBtqWme38fOnyjQ2DHYGp/d/15zk1ynCvEw==", "token_balance": 137558 }, + { "peer_id": "EiBdvGYTySnnmJC+7HY6hJ8mtgNtgEesmVGjM/bAZJO+sA==", "token_balance": 137558 }, + { "peer_id": "EiDHhNSCuO2rII/ptgl/bDU0Sp9M8gRdM4a5eOTk8JGBbA==", "token_balance": 137558 }, + { "peer_id": "EiBk/8MT5SauzL+k1yv6LdJ4IB1ANi8xG/VWP2oStpvVMg==", "token_balance": 137558 }, + { "peer_id": "EiDpJyIFhx17Wfw5i9Ae06RcoPE0h5or+uX7LO71Y+19Pg==", "token_balance": 137558 }, + { "peer_id": "EiCxYrVu5Bln87tUlUlItdaQuEFyNqj0FL9yOENzGRN94g==", "token_balance": 137558 }, + { "peer_id": "EiAV0e0LXPlk6QrL1ZQYD3yHC/As94dBNa5SnXC+8wduGw==", "token_balance": 137558 }, + { "peer_id": "EiBpRLhlsP/UL7lbpmG990Cx1Fs07gLbJn0qT/755fGGWQ==", "token_balance": 137558 }, + { "peer_id": "EiBfXibLxmGFrvVix+zDTo0f43GjXEKt0rYqMZfQyV7MJA==", "token_balance": 137558 }, + { "peer_id": "EiCqaQhJzVnVhpI/JPydNHCpQcc5YtpQ7KtljcByIGaezg==", "token_balance": 137558 }, + { "peer_id": "EiA9+jMngCwreiwZ4BQOkp4MRlEBG9Oopg2EfpSgWQ7zhg==", "token_balance": 137558 }, + { "peer_id": "EiDkWIDnoEvf8QAbWPplSJhlW365RSB92O3nt2Ur7wwtLA==", "token_balance": 137558 }, + { "peer_id": "EiDsoq1sGqFS27UHg5VPJZu/mi4xLVkOsO7aKfaPIiw9Sg==", "token_balance": 137558 }, + { "peer_id": "EiAmKQgu42zcqNBYuGM5Jy2f2KVhIvsrl4dLpJRbdoV3bQ==", "token_balance": 137558 }, + { "peer_id": "EiBRVawJJCCix8NamzTnuJ/eR1y+2AA53Arz29Wux9G4ZA==", "token_balance": 137558 }, + { "peer_id": "EiCKRjGjDdrs7V7YaYuR8whEioPfSdffIw27Hnz6WjOI0g==", "token_balance": 137558 }, + { "peer_id": 
"EiADQPc/+VKrP6PExxIIM29NtWczuEsQkIbFOuYOFK2RrQ==", "token_balance": 137558 }, + { "peer_id": "EiDgNrp4Fppegk+QsosN0y/ABGYGNPVL/bBI+W5+DxBodg==", "token_balance": 137558 }, + { "peer_id": "EiCoEn06vol7BFgli2k6joQ/X8oO35BJ8JqCJJTo8hhv+Q==", "token_balance": 137558 }, + { "peer_id": "EiCVHBFKcgVZvY+MrdeOb31tgLqsM+RFYHAbYBiZkrhMMQ==", "token_balance": 137558 }, + { "peer_id": "EiDysFOhmL573OSnuvgQCeI0uc/pWCyCjyQq2hL/IC2ARw==", "token_balance": 137558 }, + { "peer_id": "EiDZazhdtuv9vjvvHeJBjS51oNvrOwr9cZ+jMltVVbBcNw==", "token_balance": 137558 }, + { "peer_id": "EiCjDzoPmvwLkQKR/ClUHymrezQPixf6nTuTM7YiYkdE/Q==", "token_balance": 137558 }, + { "peer_id": "EiDmZTWRIgsNjFhaPEped34QUt1uBAJYSVvy7X0Hw+fJCg==", "token_balance": 137558 }, + { "peer_id": "EiALJndJOaLZYu9GaJ53HQ+OHqAtOU1s4/+iWOK/8iI42g==", "token_balance": 137558 }, + { "peer_id": "EiAQ1qkmtAgk9qEEJhGNFLaXprDMF+WNWkcZvGDKFVmOng==", "token_balance": 137558 }, + { "peer_id": "EiC2bTcJ8YY4Zz7VoSvfUlqYbRfwx6TPhntmH3PLETHXkQ==", "token_balance": 137558 }, + { "peer_id": "EiAlHUUXZnRgi49XoRmzEvLTBuf3J2WkCidjp9BA/HzslA==", "token_balance": 137558 }, + { "peer_id": "EiC4ftRiGqwMolMTJoV/gHGOykBApNz5E/3HwfvIkubIMA==", "token_balance": 137558 }, + { "peer_id": "EiAINRTHs8IfaTgVX/RztGWaf6XjMcWa9oHsWu9AqPQirQ==", "token_balance": 137558 }, + { "peer_id": "EiDKTuzZGvDyq4nrSLG1dG5EjSaBsTnI9Dhb5u2QR3w2IQ==", "token_balance": 157208 }, + { "peer_id": "EiA8mcKvhNnXtOzDkjarKpL5JPoRAaZIxd+Bxm5epWsh6A==", "token_balance": 137558 }, + { "peer_id": "EiBhB6ThVIxME1Z1BPBfxuKB94rSFUt2RDmc2+r/HUBlYA==", "token_balance": 137558 }, + { "peer_id": "EiD+dsBrCy/3DNorb9cecwT9FpR2MNNHc2OBeSA1Iu3HkQ==", "token_balance": 137558 }, + { "peer_id": "EiBGgDkjoanLHTyAKw8IR8wSnwPgnWOLcOsUQ2U+1oua2g==", "token_balance": 137558 }, + { "peer_id": "EiDk1kU9NOtQR8nMytl0QhxeH3Ssuw52OxqVvOZrslMr5w==", "token_balance": 137558 }, + { "peer_id": "EiCN+oyx7su4px0triKa6EYxl7AB0DbSpi1KJzbO86VTug==", "token_balance": 137558 }, + { "peer_id": 
"EiDcab9PStZfKELc20vvRFHE+fhzqmukDlJOR3+gAl+UmA==", "token_balance": 157208 }, + { "peer_id": "EiDp0kEdQErFqOg07FU/GnjeVB8XEpma44Vyc3mb1Ef9Eg==", "token_balance": 157208 }, + { "peer_id": "EiB89GnA0DEEes4RXAyLBJesXJSYjplUSkgpTNaoWBLsEw==", "token_balance": 157208 }, + { "peer_id": "EiA1VfdGkyx+tJG4N6UGu1hKjodgoAGTEorzWfVs+5Xw8Q==", "token_balance": 137558 }, + { "peer_id": "EiBU0+sJZJH+7Bp3a9j2LVluKd9t7UBFH5rahzHP7SpYWg==", "token_balance": 137558 }, + { "peer_id": "EiDA3iHr5GCI3fIw6p1sqS9Ut5PjuMqUdP+SitnCMvOPWg==", "token_balance": 137558 }, + { "peer_id": "EiCoQxedMtpJfB7t/35aaau4uWesK5ln4KykmSdgHG29AQ==", "token_balance": 137558 }, + { "peer_id": "EiATco/nzGcWFf8ahZJyIgppqW68b7uTDj92dh0QGrUzyA==", "token_balance": 137558 }, + { "peer_id": "EiBR2OQ10cuOu4nKdKeHaQy4Uz7IIyw3X9qCtsaNuH5Uow==", "token_balance": 137558 }, + { "peer_id": "EiCKZX/CNm+h9RK07sfStkvaxK6dHoP4/B1QCyOwQBmtug==", "token_balance": 137558 }, + { "peer_id": "EiABcdEEJJfBZJpdCbWjUh5uSRC4y59UkCp9GoM4biwT1g==", "token_balance": 137558 }, + { "peer_id": "EiCe0aTgx2Cu1zsxaVfDR7cxNu65D9v5I3T7MTIfosNG5A==", "token_balance": 137558 }, + { "peer_id": "EiDHCldHkMNoz266D1tgOti+SEToDtSjujcypt8nNL1G+w==", "token_balance": 137558 }, + { "peer_id": "EiB3vZK8ixEftjLqgkvldnn9yHTxe7mWNk6ba8mBmt9ksQ==", "token_balance": 137558 }, + { "peer_id": "EiDCSLA2xAN0e679iRafJHnACM1bKfnrLY7PIpFMxQmEFg==", "token_balance": 137558 }, + { "peer_id": "EiCLd4Vmux1VYcg6WvsfoOKy5vfL3uGtBnSCd7vE/fylWw==", "token_balance": 137558 }, + { "peer_id": "EiDPsj1bcWNbAKm+HHYmShX0kEZVA5UAcI7NyY14/CPpXQ==", "token_balance": 157208 }, + { "peer_id": "EiDqX4/5Ou8vKMRnayi0O3ltMkdvGkMRCb8G/PL87vQfFA==", "token_balance": 137558 }, + { "peer_id": "EiC68pzDnhtLXMM7nyATOZwbd+DQzQPse1J+ta5jeaOYFA==", "token_balance": 137558 }, + { "peer_id": "EiA/bxjlK17UdlZl5q//iaI+y9OYfz/sDiq/0+SkU6hT7A==", "token_balance": 137558 }, + { "peer_id": "EiCKfybjaoAGq11A2bZMx+W7NQCpEGYPDmj6IhM9TUh3tg==", "token_balance": 157208 }, + { "peer_id": 
"EiB4y3z/324s/Slp/j2Wv8hEP6RZPpL5pfkfNTI//d15/w==", "token_balance": 137558 }, + { "peer_id": "EiCtdbaksp9dOd9DDa72bWUokeG++DGYzI8tESCbN5U6TQ==", "token_balance": 137558 }, + { "peer_id": "EiBNmWmujZOthMkVX4hRXuxhT2UkGQxH3ZQSbRTYG3HkKQ==", "token_balance": 137558 }, + { "peer_id": "EiBHrClYSwP3MZZlFqPz/hycq7ujDlPGzNoHn1a2x7Bwhw==", "token_balance": 137558 }, + { "peer_id": "EiA+1+tnDCb0nAf41vlKbuXmB2NcY0CCe1FopFTwbULbxg==", "token_balance": 137558 }, + { "peer_id": "EiDmZtyzkPJQFaRPNlRXA9xqzFGZCqRI4dMxTI4xLlg5ug==", "token_balance": 137558 }, + { "peer_id": "EiAPdwVGIS5gdcR47qEmaQgYHTLGLDzbo8l+fqjFYA2qPw==", "token_balance": 137558 }, + { "peer_id": "EiDc3b5ykipRZwn2s9JD6GYWgCjjYNkDfEUH5ILBHnNPmw==", "token_balance": 137558 }, + { "peer_id": "EiAL2se2LGR/0tPFRZV1REcHjU2VzFgNfoE6BC07jkGw6w==", "token_balance": 137558 }, + { "peer_id": "EiAWxMdDkXEdvH+UyYTJ7zA/HopO7PdAeWzaFAlAPHxrAg==", "token_balance": 157208 }, + { "peer_id": "EiCUq9h/rTEx1kSSB4/LMVgsmregP6lUTomkgYrBnbjNFw==", "token_balance": 137558 }, + { "peer_id": "EiCHOyTO9Q4IfMaMUnUEiURByDFrt0eIGpgHrfMTH/qy0w==", "token_balance": 137558 }, + { "peer_id": "EiAQu5dD9vuiOMg6HzcsEMBivS/d4IQXN3h77mZjMNhibw==", "token_balance": 137558 }, + { "peer_id": "EiDXPzhsqJ/bjDAWkTTuIqlznKlSvcePJ8Mk4mhZnlCAfA==", "token_balance": 137558 }, + { "peer_id": "EiCkvJZWVRP7e4sH+GHrdXWqE7T5ADwTXCQ9SwZ6y1016g==", "token_balance": 137558 }, + { "peer_id": "EiBP9FJgyNS06x+T78clbNMSvf80nIQ+V7k+9aQhmwD/5g==", "token_balance": 137558 }, + { "peer_id": "EiBV6xdHFCqsbh/V2GHzykdmnpUaqFxNE92w4dlqokV0Xw==", "token_balance": 137558 }, + { "peer_id": "EiCCFD7uHIKJxtKbPL3LgG0wif4b8Vi8tWFd9l+1JqVDLw==", "token_balance": 137558 }, + { "peer_id": "EiDjWJ08Y7NayDPit22SnL/9lh2mdnBgejn0yULZYTkwHw==", "token_balance": 137558 }, + { "peer_id": "EiDMehCt7VlQilNOEWMDrqDTrRNgFQwB445sKjJtrTNb/A==", "token_balance": 137558 }, + { "peer_id": "EiDCwiJOFNOqormuFUv4gY9Fx3AALf2+zCGdINYXVwAU0w==", "token_balance": 137558 }, + { "peer_id": 
"EiB24XtmOig1e2COvo0PNam1LkAdVSqMaCVcoFuWfwPFjQ==", "token_balance": 137558 }, + { "peer_id": "EiCejOSxGqS/PE/DHRDdWqqtZsVg4dE3H/grq4kXA0lDFQ==", "token_balance": 137558 }, + { "peer_id": "EiA/6XxrFKmKK30gWpul7ST18nMrjCmE51M6SgRHEn0YCQ==", "token_balance": 137558 }, + { "peer_id": "EiArCutLI9X+cwDwPqe+ozv0OrjHovrvjPXA3vH5XbnY7w==", "token_balance": 137558 }, + { "peer_id": "EiCvMIjvDRwc9nKfoZGzf4PME3S7FKIQIwzXQndg5pNmWw==", "token_balance": 137558 }, + { "peer_id": "EiBpleJCfpISjLL2Qlpu0CBx0YtT/aitwblYnlbrEQhp0g==", "token_balance": 137558 }, + { "peer_id": "EiB3dJrzQj57hGlL77MICeb+TFDqzVu3ErL+bfFCKhkhtw==", "token_balance": 137558 }, + { "peer_id": "EiB1PX+8OIQEeBJn8aAbbIbhgCW2AeYXoO98ddxyGmiKMQ==", "token_balance": 137558 }, + { "peer_id": "EiB4d8LJCTTQDgqk54x0AJkhgb+WtGOaZIGJ4km8uUodMQ==", "token_balance": 137558 }, + { "peer_id": "EiAAA1mzRx2CKmcp/tmH3JZpmuJ11qFiWIz4iJKEFy5HWQ==", "token_balance": 137558 }, + { "peer_id": "EiB36hBV4anjov+eJuhz3Wu74TX5XEl0Def2x6sipX+/8Q==", "token_balance": 157208 }, + { "peer_id": "EiA/YbtP4FdCmHecgeScK+JyCgGsQXzT86F58sV6c2mWXA==", "token_balance": 137558 }, + { "peer_id": "EiD7OeDPJfZYGHp0Zhad50Zd7Rhx2YiLi/o8oUTwUQQRcw==", "token_balance": 137558 }, + { "peer_id": "EiC9YMdYZazIpQgj49BlcE31ISygEBkSDeYZJE3htKz/3g==", "token_balance": 137558 }, + { "peer_id": "EiD3zWIZoY6PXXeF7y4+a49EMNiviocb6Q7gHYu4wIpKsw==", "token_balance": 137558 }, + { "peer_id": "EiDiRmVhssP3nFyfci/Zh/EOBTzDlH6iht0UBNaaW9xLsQ==", "token_balance": 137558 }, + { "peer_id": "EiDiq1QVXeZvZoRP5lmcavDqMLk8oXKtSz+QJ7ok2zqQOw==", "token_balance": 137558 }, + { "peer_id": "EiAtM9sDIaGs9cvPhotiJDN0eX7TJPdAJ1c5lC1KR5px7Q==", "token_balance": 137558 }, + { "peer_id": "EiC0d2NFu9t8wJTBqSLCIbWLI5XVENcFHtnsMkUYUGcghg==", "token_balance": 137558 }, + { "peer_id": "EiBczdVAatCiWaVsgA4I6Qo9c8VDAndub3F1BYP6aFva1w==", "token_balance": 137558 }, + { "peer_id": "EiCQJsIDaNWniKt7oIH9PVMS4sAjqw9M1h9eI5tJOfcvbg==", "token_balance": 137558 }, + { "peer_id": 
"EiBAoCcTJStOSZN+ae9cxG/GJqJrFBPdAE9LkKxNzp0lOw==", "token_balance": 137558 }, + { "peer_id": "EiDDuejVIWjWB2EkgmzmTMXS0sWKWASp2xbiZ/w048hgLg==", "token_balance": 137558 }, + { "peer_id": "EiBTwCmsrLo9nNe+2v7aI/GfCAgvaVBx+JJlheMboLqmMw==", "token_balance": 137558 }, + { "peer_id": "EiDTBA//SALTW0B78xLh2mpJP/azixsk6pfPd9GE3VKd8g==", "token_balance": 157208 }, + { "peer_id": "EiD8xUedCdFDxKpvVHS6XxHuEon6fuXWIR5d27nWWOneQw==", "token_balance": 157208 }, + { "peer_id": "EiCkA9myYy7GEN28iFlSiunkX2hRWG5no8JKPRBnrGlBCA==", "token_balance": 137558 }, + { "peer_id": "EiC9u/D1Kc7QRXQQvYbtfdGyPORzFeyHAHlfjISFz8hlfw==", "token_balance": 137558 }, + { "peer_id": "EiBtQ+eLb0VB4yW1nXVa3iJiJ0qTYPNu/2gcyOEoV2zHbA==", "token_balance": 137558 }, + { "peer_id": "EiCwOekX3cjvUyWJAPdi1yKUW1QkpWF02joo7SKO75L+Mw==", "token_balance": 137558 }, + { "peer_id": "EiCJ0IkusUB9C9G1uS61LL1LGTKnybWemVgI6EIXnDpJhw==", "token_balance": 137558 }, + { "peer_id": "EiD0EzPqhxDeqVXu9L2f7ejpy1uyi7fMDTseHSicDS/uyQ==", "token_balance": 137558 }, + { "peer_id": "EiDYVPNxD3RkVLAn+NQRXw+Hpo9MndkCpZ9RRc4s56vkKQ==", "token_balance": 137558 }, + { "peer_id": "EiAhwM51CcGxWJu/btMQXLWcHvspYe9qmwOV71kr1saqmg==", "token_balance": 137558 }, + { "peer_id": "EiC3Kzc7YMFvjTQJNRzg3epEfqa2pf0HBUi7mO6/r4g0aw==", "token_balance": 137558 }, + { "peer_id": "EiBudCwHp8+4szbbCSoU/ub7EsqR4Ml2p6bkuKP8oqfB4w==", "token_balance": 128947 }, + { "peer_id": "EiDZobodlnDZ/nNPhlDlw29XUQlevuKLopzM4rsMC7keCA==", "token_balance": 128947 }, + { "peer_id": "EiCIdkcjKNCexeiA79DIPTsN7V6pEuZv9Jf+PIDAbXhT8g==", "token_balance": 128947 }, + { "peer_id": "EiBNcJ4MFi2zU9uRYHZecmx6mh2vnQtdtV42F7KpluNpeg==", "token_balance": 128947 }, + { "peer_id": "EiDFttf6R3TjZyW2/R9XODCUEbn3gSLruOSQclIanTGw3A==", "token_balance": 128947 }, + { "peer_id": "EiCYvXZFY0o48MA5kJL5aP+lX3jVIOyfwuxHE8QsqNsWrA==", "token_balance": 128947 }, + { "peer_id": "EiD1nq5d0976wfxEhbJqXeWO7lZgX+z9HyGuqkr03kj6wQ==", "token_balance": 128947 }, + { "peer_id": 
"EiDBqssLUs550bRROG5rT7Gh2ZSUD3yJt1sG+cR+KfEJbw==", "token_balance": 128947 }, + { "peer_id": "EiBHfY3x3OGPrj5NNv+Sv/v91uRzLunt5xDzfcjye47PMg==", "token_balance": 128947 }, + { "peer_id": "EiBVpYyEsSVHYip42IG4LbYO3R57T89Nckk7Gv0eDUPOAw==", "token_balance": 128947 }, + { "peer_id": "EiCKhTBAZRFHY3zim2MwgIg0ZHP4PEmyp0NJBqrHbouJ/w==", "token_balance": 128947 }, + { "peer_id": "EiDAlZyPyVkSoT396S5/4gqiV+T0xyFIUzfK3cu2/9YCQA==", "token_balance": 128947 }, + { "peer_id": "EiBenIgcHBcpfYF3glMJfMFFzR8vlTvEYKwut2EinyndIQ==", "token_balance": 128947 }, + { "peer_id": "EiBiCbWG5pLCORbJvDHn5WUs++MW3bJV5B18t1uk2x8vrg==", "token_balance": 128947 }, + { "peer_id": "EiB2HzTaxx/MVv/cnCzctCRJlt3dzGHS9GU90DW1acAmgw==", "token_balance": 128947 }, + { "peer_id": "EiB0ySUp4IB2umQ624skEgqsVQj0lY/LXieaABpuDV9+FA==", "token_balance": 128947 }, + { "peer_id": "EiAmVUd8apr3OsmKXyWwhJbMmmWA6BRTyZ+KI4IwAfFyCg==", "token_balance": 128947 }, + { "peer_id": "EiD83N5on3ALN7c5Yrukiz4ZvmtcVDF76I4/zRtLgc0wAA==", "token_balance": 128947 }, + { "peer_id": "EiD4bfZuFjOXKZLTxm71N6HZjzAIUM/8Sqv/cU998qo4mg==", "token_balance": 128947 }, + { "peer_id": "EiCe0FmkQzbuYrL9A0t3mwFeQdiIky2xHB7JMxjBYIeRAw==", "token_balance": 128947 }, + { "peer_id": "EiC8QR2AYCt9T7D/+f7uvyFWOKvcmUo4X0+aanBGzaVrEw==", "token_balance": 128947 }, + { "peer_id": "EiCPHbDqSZGymQQDigwc6p0jw4A13JlDF+BKvzh3O+P8fw==", "token_balance": 128947 }, + { "peer_id": "EiC9hCUmVbouUu0FP38f9zhg6x2AlhuBcQQxnIkau5X9oA==", "token_balance": 128947 }, + { "peer_id": "EiBKkvHpfQkotrQ+ZOcrHSr6Be8+Wr5JqXu9BurDWNvlQw==", "token_balance": 128947 }, + { "peer_id": "EiA/LKqaSLse/YfL2nBR0F3Vd7vrvmekkSMV2QUY8psjVA==", "token_balance": 128947 }, + { "peer_id": "EiA38eVX9pSSs+INUgLi/t3Zihv+LNAKjmW2Qt7+BwCx1Q==", "token_balance": 128947 }, + { "peer_id": "EiAlX1Mw91D9yPSEL479b5zRTfbUb+BSQ5QuGikmkgL3eA==", "token_balance": 128947 }, + { "peer_id": "EiAj8vNP8zEmMcEQTlsucUe82e7oBk8QVtvZDOmsje7Aew==", "token_balance": 128947 }, + { "peer_id": 
"EiAXmwRH8HdwKPDz7APhYqzVTxhoPUpdKmwSM5tPPkRm7Q==", "token_balance": 128947 }, + { "peer_id": "EiCrTZ/hTLYMdmQsS5mdScLKoGJTMoZ3F/U4GRxfB9gguA==", "token_balance": 128947 }, + { "peer_id": "EiAkWA+OKVn1Ajrp0evPyr/0rZD3doU4WmBkY4CtxzVYVw==", "token_balance": 128947 }, + { "peer_id": "EiCXvAsfWvjDuhFloqV/WQitXhHabIRBZqOHR3d5rVSiLA==", "token_balance": 128947 }, + { "peer_id": "EiDugvOk+eWUfMCo6LaEZbCdu6an+96p63qlPEHULHUTCA==", "token_balance": 128947 }, + { "peer_id": "EiDr0x0Yvn1qTyqfzwKne/hAnosvKjk9OWfsnbFI8tB2Aw==", "token_balance": 120115 }, + { "peer_id": "EiD5JmPqKCdDs3RIhBgapI1Ie4S6lddJ4O/iiMPwYoyoGg==", "token_balance": 120115 }, + { "peer_id": "EiCoPeQ+leGveEM5P/TX0fdNuWTg7Dp6T7lQiXzDLhbw7w==", "token_balance": 120115 }, + { "peer_id": "EiDf9LzrOWpezMQjR9rOjU+ZabucZu/yCLkzpvTJAj3dDw==", "token_balance": 120115 }, + { "peer_id": "EiCefkAcM4ut0sZiT2f6aEaNtIZI2hhjzT/itbe6PIqkuw==", "token_balance": 107750 }, + { "peer_id": "EiA24CKWMo0F6Am5cHrzVooLkX5sO0eFb3L3+jq74qY3hQ==", "token_balance": 107750 }, + { "peer_id": "EiAwlQCRHAtudM+JuqRo7h6Spgcp689hQeYufdGOVx9QPg==", "token_balance": 107750 }, + { "peer_id": "EiBfXdvIsVFwKz5WVDS4v4DCp7RY7WqiIL2ZDTWDVEz+yw==", "token_balance": 107750 }, + { "peer_id": "EiB06ZbA3xXEqWhuZGTHPUL1v0LOC9XmTjGLZQSiECcJLg==", "token_balance": 107750 }, + { "peer_id": "EiBa9ClQNKr1YNREaaZn5e7IrA0XkoCur/YTFlCPufsWkQ==", "token_balance": 107750 }, + { "peer_id": "EiCuH4VWAQn9FbvJem/dmswYYfRW8A7H+5Q3URK5gNv+NA==", "token_balance": 107750 }, + { "peer_id": "EiAX3pvxWa621huB5lEUELwY5RYFCrRd1KToebTN3TAXMA==", "token_balance": 107750 }, + { "peer_id": "EiB2EikvzsEXIL0Ybe2ssRjnmiATWfw9jXpWP9cL90dydw==", "token_balance": 107750 }, + { "peer_id": "EiCTzrmo7Gqk6p9gpJJvgyIb/rc9JT1OdjojH6TMKSMf8w==", "token_balance": 107750 }, + { "peer_id": "EiB+FoAYEbJ0RVI9L1rjxelLzUrbaHP1aOn/SQBbuVTFrA==", "token_balance": 107750 }, + { "peer_id": "EiDDwaJAiOcjRggi9W/DS01gwitZ4plOthL+sV/F2WydQQ==", "token_balance": 107750 }, + { "peer_id": 
"EiBk3y1Uw8WXmUHDRxc9ouse00KC9Dr3czFOG4MTZnUhsQ==", "token_balance": 107750 }, + { "peer_id": "EiD9PRQM5Kjk4lA5SV0wvdDJfgZjFrHyN5sZxXVySLuu7w==", "token_balance": 107750 }, + { "peer_id": "EiDmQlALdT6m6Q8wZM16eZGLVB9pjTsZEWivEIn4pdkczw==", "token_balance": 107750 }, + { "peer_id": "EiDp29FXbAscbKZNcH99kEDP0PcGwW22TsMxnn1HK1D1Rw==", "token_balance": 107750 }, + { "peer_id": "EiDj1Bddcb+fXnm4W8HLMHjNqjtlbhWmfED/1Oha+w33Hw==", "token_balance": 107750 }, + { "peer_id": "EiAuMprA+CZlzE1ehX9O3M7Y+ootz05eCVaFVyD8Udi47Q==", "token_balance": 107750 }, + { "peer_id": "EiDoyelvLUjWjyL343FNLw4b1v+6Loyo3YIJuG6Agb+TfA==", "token_balance": 107750 }, + { "peer_id": "EiAm0kOxey9yhwCN/J5WsfIm8fXRZuAX7gx4dWz5OgguKg==", "token_balance": 107750 }, + { "peer_id": "EiDVWI/eL4QK9rtZIRyqa36nRNaCroGY/8yX2Ex3q+4SQw==", "token_balance": 107750 }, + { "peer_id": "EiCUnLSqUJpeIkKltKHECTUl84MDnZtko0WTRh5OqCmO6A==", "token_balance": 107750 }, + { "peer_id": "EiDSP7Mw4TRSBmuF9OOEAXtcK//RTzJLWw3xwuVLQijRVQ==", "token_balance": 107750 }, + { "peer_id": "EiB5FLxFycijk+kbjUCLVN+u/LMdwvEHbiA8UmseaRJUDA==", "token_balance": 107750 }, + { "peer_id": "EiALqqXPhdT+LWlx2cbx3vYiFGxeUhv+KgyhE+MT8g7fLg==", "token_balance": 107750 }, + { "peer_id": "EiAQctJosQjmEukddnIjsML9I8IVPEG9AXnwIgSOwS1tag==", "token_balance": 107750 }, + { "peer_id": "EiB1uhvNx1vVURYaDgyjtKh/kAvKDTA+pizs9Ki1WK/nGg==", "token_balance": 107750 }, + { "peer_id": "EiBe+uGO+zfmX3+O+o4ZMG2FJYj8WNpF/3x8WIG/lpvr2Q==", "token_balance": 61824 }, + { "peer_id": "EiCB9chm8KaN8pZymFIdkbNZo0mi92Td9zwzZ3AoKrO79A==", "token_balance": 61824 }, + { "peer_id": "EiAPtMZC5vXTmvJp3tHtDWN//mufykfif5ytS+lM1xxMwA==", "token_balance": 61824 }, + { "peer_id": "EiD+FBGriVFqbWsGufY4M/7fljQ6Qczaa+ukYopboNnw0Q==", "token_balance": 61824 }, + { "peer_id": "EiAuD4e+ZRXsFJGh05UNVsqLQJQaaJ/E1PQZN6ElMB4Nnw==", "token_balance": 61824 }, + { "peer_id": "EiDTB6eObT+GA+cXvO/5j98NVdqQcU9wyBJ+/vSKIumj4A==", "token_balance": 61824 }, + { "peer_id": 
"EiDca75Qx2Lr6s/Lrbl9O6Z72bivN/mhpX/ckh8rwAK+fA==", "token_balance": 61824 }, + { "peer_id": "EiBvZAiuQHUUkZGpqjuOpwriYtQZwGArCN5E9NsoEEWu2Q==", "token_balance": 61824 }, + { "peer_id": "EiDRpjDjicKK6CO8udUAjJgAGqyCUNXVzAUa9PV/lnkenA==", "token_balance": 61824 }, + { "peer_id": "EiCwOvw+44dQw+xdnOoPehvrg3b/PNFBStXpRzj3t9+J8g==", "token_balance": 61824 }, + { "peer_id": "EiA3l0p54keWgYXyLjbyM6OIstASS0qMdA/8q/CCnvqIZA==", "token_balance": 61824 }, + { "peer_id": "EiCSeatYJDXNTeO5WPM7M8B8ts5SG+KPpda8MntmQAhVAQ==", "token_balance": 61824 }, + { "peer_id": "EiDs483tDwR8UI6GN9TlFNh2/BsrGKRIUJxXLJLKf9Sx2Q==", "token_balance": 61824 }, + { "peer_id": "EiDNkWoJFRRDalKXKC04bPXXg1UxEkGpRj/Dx7+02Z+ptA==", "token_balance": 61824 }, + { "peer_id": "EiCqTs19rOr56QlWFibxPrsgUwl8cWXzDxTmyE83dm+p/Q==", "token_balance": 61824 }, + { "peer_id": "EiB0lN+j8cxNve80vwVj+UnzYC5lkn5AzQxb1AitNkoiaA==", "token_balance": 61824 }, + { "peer_id": "EiCYihN/GnRlHD7rTEie7A5yPyYgFbLgY9mAvxz8gB2lrg==", "token_balance": 61824 }, + { "peer_id": "EiDY6YKI85tRvI+P3PlJ0YZovmCcAVQQHQOMZKZ/Y7hbsA==", "token_balance": 61824 }, + { "peer_id": "EiAk1nTaG+sv0sTHLdoB9v11oH1BmyGIZjMYOLQTLhF8KA==", "token_balance": 61824 }, + { "peer_id": "EiD24dCVyfK2SQFtCr0v+cb7m/R8fz0aSqi3MFdkG0heiw==", "token_balance": 61824 }, + { "peer_id": "EiB7yaEqAc4bQdJIxKNh7xpk86m3P6xgFWua7Y5mRBIaPg==", "token_balance": 61824 }, + { "peer_id": "EiC53Ud7eV3p75zwuoDzkkXdmZPbyz127GYJkNptKauLfw==", "token_balance": 61824 }, + { "peer_id": "EiCb/z7SCCYAb6ZRnj9pmDH/quyo9jJgUg22NNEYa/FgZQ==", "token_balance": 61824 }, + { "peer_id": "EiACcSxsFxdFF5tYlWtEJzu4phV0xY9nvT04UWxuzgaDJQ==", "token_balance": 61824 }, + { "peer_id": "EiCLgtRnsVkmApnbIuS4rKWuhHO/TNKb8RuMtMK/+RE4ow==", "token_balance": 61824 }, + { "peer_id": "EiAa5nZen9aFIi24iHtZ/Ex3lmQ2EPCmkeCPDFMwiOcNMQ==", "token_balance": 61824 }, + { "peer_id": "EiCujlXgMAztpSmJQXfEie+4VsQQ5ycaeyp1BWk4zkkfvA==", "token_balance": 61824 }, + { "peer_id": 
"EiAFPSGu47UX5mK1WZub/cyz+Eqzz/kHojG+EObqOT+Tig==", "token_balance": 61824 }, + { "peer_id": "EiCS2kZI7/gne4GKvMrZ2pZ2M2dvdOQT1SonV/VcHTL6rw==", "token_balance": 61824 }, + { "peer_id": "EiA+C/aDS3W3hp7Q8NU9KQrTk+1JSdVgOQUuZhBU1iQEvQ==", "token_balance": 61824 }, + { "peer_id": "EiCeW9mbI6XPL/+u3SCSBP/vnh3Y+04Nqa5j2aQu/es6Bw==", "token_balance": 61824 }, + { "peer_id": "EiDfDAJkh33TLXipy070sZFs6gkawKYOktAiMJzAlY3dHQ==", "token_balance": 61824 }, + { "peer_id": "EiB8E50DyoPLYs31IOyU5TVjEAHi9fQaLFKpkc6/ybGycg==", "token_balance": 61824 }, + { "peer_id": "EiDbjh0elVYk86eG92sl594syNz+T4kmRUOfGg5OT6jIBw==", "token_balance": 61824 }, + { "peer_id": "EiASb36QWr/w2bfUVBAdeyvJdae6VG6ymGHn1F6ThMqOXg==", "token_balance": 61824 }, + { "peer_id": "EiAOMF0CzwdP71hBZX85HrI/7EL3B0V5R1cQSt6P9tWVHQ==", "token_balance": 61824 }, + { "peer_id": "EiC1km3lus0lSHxQHZBITkn27ALaloZ+tm5wm4eZUQp05w==", "token_balance": 61824 }, + { "peer_id": "EiBn1Q9Yyf/9wN0DNHqi1BXtFnr8MO1rx/GKCBP9SIcD5w==", "token_balance": 61824 }, + { "peer_id": "EiD/aXrgArqCjMprLcySsatTjG3lmN3Vpm21ERGBapnbCQ==", "token_balance": 61824 }, + { "peer_id": "EiBGo3UaRFtHxngQqLNRhkDOEaWs5mNe1ZsliC5H9N0+FA==", "token_balance": 61824 }, + { "peer_id": "EiD4SkT81EXr+ZYU8aEtP5IAbLYRb4hfN4CWIbgoMa15Ew==", "token_balance": 61824 }, + { "peer_id": "EiDgOMRPpkwnokWMhfIulVCx5iRI/ZtVtkvGifd7oplQqA==", "token_balance": 61824 }, + { "peer_id": "EiAOU1Vo85qO+op2b8nHbDxEoQB1PiugGmU6oM1xIxGfIA==", "token_balance": 61824 }, + { "peer_id": "EiDUwdzfroP/KA3GUUhaS+FAUnN7St87xdg2aRbCxPYgBA==", "token_balance": 61824 }, + { "peer_id": "EiBH4XPQlGcXC4Tp64lv/dXntJwHAiwfiyd5KbiCALchpg==", "token_balance": 61824 }, + { "peer_id": "EiDuLC/SyTv5wbp/jsRjHXQz3ME2r65RMBoULwq8GOj8gA==", "token_balance": 61824 }, + { "peer_id": "EiC7FJ83dVoH+KhxBsRiWCOvvmOmgcbOykqjIJZGuMWtQw==", "token_balance": 61824 }, + { "peer_id": "EiCrnRmoeqytLLo2t7gFXN4DoXgDcW5gGxsdfx1NxiDPfQ==", "token_balance": 61824 }, + { "peer_id": 
"EiAUPveUbHBELb34KirMKvDmYFpM6gu7Y+7RFWGqt5G4TQ==", "token_balance": 61824 }, + { "peer_id": "EiBLmdy419du6Cr3FJ2JJ+pJknek6vvVBquFr6Wof24aqg==", "token_balance": 61824 }, + { "peer_id": "EiDet0IKnZSHTrc5runAi0s4fPU66sCrfVnC3YrKxl6FvQ==", "token_balance": 61824 }, + { "peer_id": "EiB8YNNT+f32fQyo5Yb2AZlj5djjuqohs2rS16847nfo7w==", "token_balance": 61824 }, + { "peer_id": "EiBfRtaMHINkOraGjpoveo0xGmXVIiTkcLPY3+s5zCSTBg==", "token_balance": 61824 }, + { "peer_id": "EiBzobVhRUY6NmHFQphTXzSVwGec4Dh6RO5hExcLzrurJA==", "token_balance": 61824 }, + { "peer_id": "EiCgn1324zpvTbmxl+4IkM7QsODIaApQ7szZowWWowBd+Q==", "token_balance": 61824 }, + { "peer_id": "EiBmytP4g2gix8OkjbEFrvPl9lr1KyW6SSjuo1duWg8mcw==", "token_balance": 61824 }, + { "peer_id": "EiBw6ZcCRqk6jnsin0txNjOXU33pycAv0dfvno2slS1orQ==", "token_balance": 61824 }, + { "peer_id": "EiB+pcJzbDvkZt9wFeDouWA0A+uWg1EoiNdkRCjTUshjvg==", "token_balance": 61824 }, + { "peer_id": "EiC0Z7QzQ7+LJMQDShs/t50tSSpgdBLHEVnIi37TCrcI/Q==", "token_balance": 61824 }, + { "peer_id": "EiDIhtoFJIXlDLxa30nAlL1xfVEsPer1PZPtL+OfImXoTQ==", "token_balance": 61824 }, + { "peer_id": "EiDqCjFGTEahSbLOKlHJhBWegH0Qp+ulT6OId6U9UlN82g==", "token_balance": 61824 }, + { "peer_id": "EiCnoPVKUzy4ckmKh29PTYa5YNLEFk7dSJp8jkjDfldZlA==", "token_balance": 61824 }, + { "peer_id": "EiAeijjugEKMsWJK2S4k9ZsdxL2AtNoo3yiu9yzgCUYljw==", "token_balance": 61824 }, + { "peer_id": "EiCgBkybsxAJev8LDuGaHFo+Y/0oXalNGwyNU8bn3dZ2EA==", "token_balance": 61824 }, + { "peer_id": "EiDI04gf7fFIdMgy48woEvhL37qjCLBoXO4FRx9SYjchkw==", "token_balance": 61824 }, + { "peer_id": "EiDXKrbqTGUCD+Sc+vK6dffF2hPyqrb24ZNYdnuGWlZJ0A==", "token_balance": 61824 }, + { "peer_id": "EiAuW/4ekqiqvUQ9uAuW8MmlFab0hGUzed7XYA547uNFVQ==", "token_balance": 61824 }, + { "peer_id": "EiCAisdo4y1a8hhfbTW57cf1mO1B0ozIjWFWBl/Rpuc8NQ==", "token_balance": 61824 }, + { "peer_id": "EiAD7hV/sysaahbEZvB1wgQ0qSZ7TJeCQpTEKJQp8e7gRA==", "token_balance": 61824 }, + { "peer_id": 
"EiAe5RkC1xrO2dLHNKRDfVMPnfQdqkuoFiSUEwq7/kqt7w==", "token_balance": 61824 }, + { "peer_id": "EiBf6AAXBRJNIlo4EVlCowR+GrkS1xtOZqfFDwZrbdj76A==", "token_balance": 61824 }, + { "peer_id": "EiAf6JNp6EJ+slqf6gFNccTE5Weaje/7mH9AC65B/l0XDA==", "token_balance": 61824 }, + { "peer_id": "EiDVQtE9e+nVS3KNnlT+K6I+8yyUbWgOCQ+i7q7ygQQd3A==", "token_balance": 61824 }, + { "peer_id": "EiB6VIK2jYp6H3TaNNvfekSteOtOtDTBR0ouuw08cz8goQ==", "token_balance": 61824 }, + { "peer_id": "EiBsKhzoSZX3OfW7JkaQ4SMhG4OFLQC/dLVS8UdqKRl8iw==", "token_balance": 61824 }, + { "peer_id": "EiBbyy7XmY+K1a7KZ3B1Rhqv8N5QycrRBi+SIMoTd1gCAw==", "token_balance": 61824 }, + { "peer_id": "EiDxKveI0vntWs5k2Gbkp7Jz1gsOWmM965JIkPIfybYM0g==", "token_balance": 61824 }, + { "peer_id": "EiCanoHsHQfIYBX2m61Lx8sQIJgfQR5kwisnqIzqjgoKJA==", "token_balance": 61824 }, + { "peer_id": "EiAm6wr+C8ONNrhJ/C03Urtr6ZymsqwUrQWcFO2sqehWFQ==", "token_balance": 61824 }, + { "peer_id": "EiBKuUHD5fLHHw0zbGDoSnudE0cK9MYqWgIlztDit86Cfw==", "token_balance": 61824 }, + { "peer_id": "EiD4l58piCN43cC082ods0+ZU14ztKJA3oJFYDDdxbZUmw==", "token_balance": 61824 }, + { "peer_id": "EiAv5VroWTdm+bfqpZP/GJNFBsZ4w7yX9MwjzNuk8VrDow==", "token_balance": 61824 }, + { "peer_id": "EiBpnRvySi87aP7LRpnZzTUORl+Xd3hXS+xm7HUHqWMPNQ==", "token_balance": 61824 }, + { "peer_id": "EiCvMFaIEmqxfeWqEIc8wupEbN9/UkgKzGpJp8Y7DSX1Pg==", "token_balance": 61824 }, + { "peer_id": "EiATuJlAk0T59YjAfRW/QU/r2TWsQ25HQs6ufKaKnbjsXg==", "token_balance": 61824 }, + { "peer_id": "EiCq8997oUfbEIBjahCrhxh7H31zVfdNkFkj+BDhhxaYQQ==", "token_balance": 61824 }, + { "peer_id": "EiAuGeUeyMApmQHbnLJNJsAKPshU1DGMZTVYEyVYNgoZyg==", "token_balance": 61824 }, + { "peer_id": "EiADsA5N8dze0U/Uo3KcuIlE4jYpVWNlfb9MkilY3kq7tw==", "token_balance": 61824 }, + { "peer_id": "EiB540GDYq60zLwqV7CKQn0+wN/Pdg67xnwZzFOhoh/2FA==", "token_balance": 61824 }, + { "peer_id": "EiAFs3PzeiYao4UnwO0+fBRE1sQfm3WJu3K0KD9wQyU4gA==", "token_balance": 61824 }, + { "peer_id": 
"EiBxWRcB+i8yYk6JsfjgWsiumDlx1d0rirWObGy+WrgSQw==", "token_balance": 61824 }, + { "peer_id": "EiC6Qc4m9Av+L6T3p2vtwzKhi9IQphqMxdjfjBjzeZAfNA==", "token_balance": 61824 }, + { "peer_id": "EiBWHq49B6I2twbr4ojxEMZ1zsPL3KWFVPz4zAs9pcNCMw==", "token_balance": 61824 }, + { "peer_id": "EiB++1x6xUHvHycpWOBnSyLQJoZI2phig50z+5M/pKAR4A==", "token_balance": 61824 }, + { "peer_id": "EiCtob6JlVawBXqHImckijCy8f3Ge7WNpRDAXCOlTRGuDg==", "token_balance": 61824 }, + { "peer_id": "EiB/Jdh3vbXBDgM3WppnUuct25thmE+eZDpv3yX491D2eg==", "token_balance": 61824 }, + { "peer_id": "EiBac8yDlhETCRXuEjqOBNwaR6VdAsN8Ysz3wd+Iw4WdPQ==", "token_balance": 61824 }, + { "peer_id": "EiDo/GPoHDZ30pnJ59vjSE0+SzcYIeR8lKknAQhzpuytJQ==", "token_balance": 61824 }, + { "peer_id": "EiDodcIFaqlsODm+b2HhJAFL/mAxmixJOVVbWWb+zbetSg==", "token_balance": 61824 }, + { "peer_id": "EiAxlPx8NRUo/Sz/EFfVkIhfa8W906wIz5KuDdpwT7lT+g==", "token_balance": 61824 }, + { "peer_id": "EiB8ev89g0spull/cBised4VxH7ny3cF6cDy9kqnLCvU2Q==", "token_balance": 61824 }, + { "peer_id": "EiC/nVmFcGmGT1CYY93X+PSTki2bLYhpG9y1U/8xxsz6tQ==", "token_balance": 61824 }, + { "peer_id": "EiCsdBs8fE2TDzQBj7hiTo8bJMmHyD3yt5H6D4CN0V99hg==", "token_balance": 61824 }, + { "peer_id": "EiA4yAXe96tr27C0bnyRu2nkBThd6f8s+5OH+WNbVx1Wjg==", "token_balance": 61824 }, + { "peer_id": "EiAcF3iqG1YDBQskroraFGPmgPkQeEToC0PeS0+/4Fxg8g==", "token_balance": 61824 }, + { "peer_id": "EiA8TCBqYy/w8Bkis0Lc9VEgcxntOIoxKFgoEEI9obruZg==", "token_balance": 61824 }, + { "peer_id": "EiCafNjTZ6ikwsNN6IHbT0d8V86sHGPnRsFAXe36xCwUUg==", "token_balance": 61824 }, + { "peer_id": "EiDQ9b0Td2Q61QD5PpXP5oa16wxJc9UDIlPJ4w64ZNvqYw==", "token_balance": 61824 }, + { "peer_id": "EiCHFUr7q08y7RpdkeL8mXLsnGA8M06r3kd9nBUTprDVtQ==", "token_balance": 61824 }, + { "peer_id": "EiDAX0wjwdc/SVHLDZN/LxXHHEN9ErUP9wbmHCU7ptrINA==", "token_balance": 61824 }, + { "peer_id": "EiAB8GYGS0bYBxwtxHRjfk1+NjCjo0qCe8zgKcvrwpGYkw==", "token_balance": 61824 }, + { "peer_id": 
"EiBBaL8rpEK4BzwxcWfBEq2OrEAoHTPymv2uwDYHqbfmRQ==", "token_balance": 61824 }, + { "peer_id": "EiA47GEI33UVHQEetz4Kao7xelFsR5nUMwDIamPxqHvxVA==", "token_balance": 61824 }, + { "peer_id": "EiB0ONqncERGkOWujrOSliQzQXTZBtdbypJ+Pmo8+5SqBA==", "token_balance": 61824 }, + { "peer_id": "EiCIOoud54jxO9jx390end9EIQaPZuqtKsY/F8WnFSmJfA==", "token_balance": 61824 }, + { "peer_id": "EiDnSgQZSMRH0M/Ezf5gypb/QNrbLkdcWC8GyCnpLVZwzg==", "token_balance": 61824 }, + { "peer_id": "EiAptsYmq/a5Icp9tR1AeMP3hxlpFZkOEq/k0BxMSW0wsw==", "token_balance": 61824 }, + { "peer_id": "EiCupmS1ciQafdBAooHOmPSZYGRIqWX1BVGmIz+xhDKN7A==", "token_balance": 61824 }, + { "peer_id": "EiCvpniatiKdJDnEoQeCrBJ5/Ojru5/uOjQQN/Vz1iCmzA==", "token_balance": 61824 }, + { "peer_id": "EiB+qjdMyYs5OIO/Sh//4EHHbfWEizaEGFFLc+w6A/NsYQ==", "token_balance": 61824 }, + { "peer_id": "EiCAK916o+OcyYV/7hCUiFVSfX3MCletSrzSHEMAgAvstQ==", "token_balance": 61824 }, + { "peer_id": "EiC+M2tOkhL/A40NYkgLxmI0/aSzC6A3gLc3wFF1EwQamw==", "token_balance": 61824 }, + { "peer_id": "EiA/akLerobM52O/J17SIMN2BKis+CPamCHdEVrEnCtcGw==", "token_balance": 33561 }, + { "peer_id": "EiA/iZh3KK5VxH9cc3iPp9A8NVZf97E+nbBiiSnvrx++Vg==", "token_balance": 33561 }, + { "peer_id": "EiA0740dd4FBfTVQ5B8XvGYVlyRR0JO/5y0NPkYDTlhx+w==", "token_balance": 33561 }, + { "peer_id": "EiA0HZzbMv0NIfY3mhHxbkuiOybc9eTgZ205Rvmt0LqEtg==", "token_balance": 33561 }, + { "peer_id": "EiA0TrhWj0QVgt6KeqEKuji+TYYGpMD3IuAaKbiTVRHpww==", "token_balance": 33561 }, + { "peer_id": "EiA2baZcR3a8El5GY/I+UPfTqG60Gac55TMZLccPKzFmDw==", "token_balance": 33561 }, + { "peer_id": "EiA5AGZ8I4EIf1nMgc/FBkI/4LjQcXv73DlD+fNVtOkq0Q==", "token_balance": 33561 }, + { "peer_id": "EiA6l1cD3nprm93KF9Y8KfPkogh2bqs7bx8rOZy23TP2kg==", "token_balance": 33561 }, + { "peer_id": "EiA7Pz+t1mGSvtRFLWarCW/okdx1CIOIIFG9+D13uWX6Jw==", "token_balance": 33561 }, + { "peer_id": "EiA9btruVye7Cpf0WiIda67ZKqgPL4YLWC+ABDY0rpFrbg==", "token_balance": 33561 }, + { "peer_id": 
"EiA9kSxfO2PvQAqguGxEOHX5tyrMBOSwwJQgZ/g/h97Dpg==", "token_balance": 33561 }, + { "peer_id": "EiA9vOx1NGXZZXUP906t9G8u1fFV2xH+o29kbPXvSnA5RA==", "token_balance": 33561 }, + { "peer_id": "EiAa3HCAVFV973qGJZEEg3lmgys5cKXH6G7HVx0f9DmBIA==", "token_balance": 33561 }, + { "peer_id": "EiAAhlHx5OR6cLBYnnpX5nXs4Y8k9RjPJrhzf3cHQb2xkg==", "token_balance": 33561 }, + { "peer_id": "EiAAmnX4CodVLPBwcvIEs1Oqcj6ApANp2xYPHdjea50jRw==", "token_balance": 33561 }, + { "peer_id": "EiAAPW1G0+5S+z8nfgFHySiKbsX9CT2+13kcLfRhYalkww==", "token_balance": 33561 }, + { "peer_id": "EiAaxxb4RlJ74mvFyGacvhRjJreBEKTykeYDwBsGmrabOw==", "token_balance": 33561 }, + { "peer_id": "EiAb6kSbftKbXZ8qumYjHn+wxCNY6RtC2wDX0/e1fPDEYg==", "token_balance": 33561 }, + { "peer_id": "EiAbbDQeJ71MCaOfNpqQIF4hJEfxJOolOf3Halfg7pORaw==", "token_balance": 33561 }, + { "peer_id": "EiABQRGzduIk5aWq5hkALpsJdx3MJBDv8fRHoUyyOqjomQ==", "token_balance": 33561 }, + { "peer_id": "EiAbtdBmWtDs1u6iUBlIt+Y5v6Sj8b7fPxfQ0mqisU0lMQ==", "token_balance": 33561 }, + { "peer_id": "EiAbwwA4wpLCEukt6vg4EiRiP/HI4OS+cxRi+WpdIDtBag==", "token_balance": 33561 }, + { "peer_id": "EiAc2s3k+s+MEeaoQ8ANiH28dSxWbbdLic4W9mMJ9M3PeQ==", "token_balance": 33561 }, + { "peer_id": "EiACz7owFX2P2m1qMsODZuLXRLd9fMc83WWf7cHFJMmqHw==", "token_balance": 33561 }, + { "peer_id": "EiADAfzHMYcg5UUGt0sIwVmJDkiiSM7kK/r7n2a3vh1Guw==", "token_balance": 33561 }, + { "peer_id": "EiAdOK/lq4G7SqVxEHZE9Jv6CWfHoK5a1vwfbm4O+knyGA==", "token_balance": 33561 }, + { "peer_id": "EiAdS9WLuPm1c5BU+Irw/Lc31UhYmddscB80aExVsyu9ZQ==", "token_balance": 33561 }, + { "peer_id": "EiAF8Kmvn7JtUwHLtej153fVHzwOpzA9lbL/ShtMRISbjw==", "token_balance": 33561 }, + { "peer_id": "EiAFb9b1iasILaveYLBq+dudCxXZrZ5mYWpqMTDPLP5SjQ==", "token_balance": 33561 }, + { "peer_id": "EiAFLa9OqY3DgnYxvAyUjrcFw/vbnTYXxz1VB28kUhrSdQ==", "token_balance": 33561 }, + { "peer_id": "EiAfqrHIIo2+Dr1aaQVcmeQ/Z0W9PlnCZoZQgRCHi0ue6Q==", "token_balance": 33561 }, + { "peer_id": 
"EiAfvR+3gMSl1rRb06UjW3r4BnqBmKScDq0SwcWnaHFK4w==", "token_balance": 33561 }, + { "peer_id": "EiAGi/THEGK8Xnj+T8j5Ts/E1Tr/gde8GDR4YkU8T9OYRw==", "token_balance": 33561 }, + { "peer_id": "EiAhUaHSoj3BKUUnvVg9abdHuq63Tm0qWIN1nX2xmyCV3Q==", "token_balance": 33561 }, + { "peer_id": "EiAHwIbKCjbm+BmxOEarNRsTtbh1H8/I9lWC6PunWpFGhw==", "token_balance": 33561 }, + { "peer_id": "EiAISnKrs5ASNGb0s7xnZjrrCCWevPDOsd4lHv57J5gW2g==", "token_balance": 33561 }, + { "peer_id": "EiAJdvz9obduntJXgi8Q6bHHrn6Vzhqtb2Qq0n8aNgukUQ==", "token_balance": 33561 }, + { "peer_id": "EiAJS3i+shd9vHDEYbyeMVlZS0RQ7e8kGIGdZ+H1V1+7Wg==", "token_balance": 33561 }, + { "peer_id": "EiAl6DYRNqRMtEKbPs4RQR0IBo1zqfM6QqVsORU5hXCIHA==", "token_balance": 33561 }, + { "peer_id": "EiAlE+Y2efamrIMiM8ykoVS373b9JfRyIUYs9MepXryPyQ==", "token_balance": 33561 }, + { "peer_id": "EiAmbSKaIkxs4INgf+P28hu1U8ZllazNtkhbZzlua6kb7Q==", "token_balance": 33561 }, + { "peer_id": "EiAMjLq1fHta0P+DZ8Lj1qiR6mT1TyWf/OEdHltrViFlpA==", "token_balance": 33561 }, + { "peer_id": "EiAmX3zdnKnKmdFHWEgYVvABA6QqCBCovXoqIF7kFmN3dg==", "token_balance": 33561 }, + { "peer_id": "EiAnTKU212B/Yb64J7v14ZSIioBqbPQXog6PvSGLOE/msA==", "token_balance": 33561 }, + { "peer_id": "EiAO+aV6/Czp74D2m6SI+x0wwtL7lAwIULzf2z+ObV0ydg==", "token_balance": 33561 }, + { "peer_id": "EiAobcBgASGBqo7kpINLnNmwv+v383h/MA2x/V58c7tQPA==", "token_balance": 33561 }, + { "peer_id": "EiAoO4D+jjdejWYA160gIaeJNpowr724X6/4cRKMqOYCJQ==", "token_balance": 33561 }, + { "peer_id": "EiAoyXeMEGhqze7b7QVVS7Tx1xfOEqtfi8LoVbwgs/WHrQ==", "token_balance": 33561 }, + { "peer_id": "EiAp0+gEXAkzJoOuBJMgndElivLXDR3smEwUTl0lYhz1RQ==", "token_balance": 33561 }, + { "peer_id": "EiAp60bHplMpPtpLEndCFc86K+wdGmej8jNK7x6L3/Zn8Q==", "token_balance": 33561 }, + { "peer_id": "EiAp6WcyGNk8adu4zRBt9tsEIErFtxdLB95icnuay68UVw==", "token_balance": 33561 }, + { "peer_id": "EiAq+eHz1CKq+c6gkJdz1Q6bIutfH8uyVjq5PRelTGn1TA==", "token_balance": 33561 }, + { "peer_id": 
"EiAqIi/Ohox19hK3Q57dwCjTGBcUSpprh8JS5CBggd/OAA==", "token_balance": 33561 }, + { "peer_id": "EiAQZ+QgJ5m59tuoSE1l/ieEwNlz6Po7223pYwmnDXGKIA==", "token_balance": 33561 }, + { "peer_id": "EiAR+iLt2TonvFdwsPc+CnhggVMyLJgsUTDv3uP5ViTFww==", "token_balance": 33561 }, + { "peer_id": "EiArIQvipFwtI6XERgNx0pqGjoZ1MX56hYRu8CdpulsZ8w==", "token_balance": 33561 }, + { "peer_id": "EiArK2bArvm4MYl7py3z2Xp3fBxe1ma0tEkuQR8zzPUskQ==", "token_balance": 33561 }, + { "peer_id": "EiAsfjCQGZLXjDJche5we8SDpT7++fH/NTlb8CjSYkMrjQ==", "token_balance": 33561 }, + { "peer_id": "EiCwv4d6oHK+EYLz3vx5bQe75I7Ps0LYrqNpSdw076vHiw==", "token_balance": 33561 }, + { "peer_id": "EiASJpXnYpnbh9KqA1ejjNrx548nvzREhhggyl+jnqkpng==", "token_balance": 33561 }, + { "peer_id": "EiAsydKWNZRjiSX2NMERO0JKVO8SSr9GqhviqQQG+MbS1g==", "token_balance": 33561 }, + { "peer_id": "EiATfXiLdhvhFAm3qzjqrjQc8ayLjCs0hFE5rCWXkTDsUg==", "token_balance": 33561 }, + { "peer_id": "EiAtMEKlsUwrSbsube1d6cJllBLsNq1j+iH56W1OnV7lmQ==", "token_balance": 33561 }, + { "peer_id": "EiATulgo3nHfYPQp0djiFlyhL8c9ZirZ4NSGTbTwI6dR4A==", "token_balance": 33561 }, + { "peer_id": "EiAVMJPE3xobrvDBSYrWYyFIESnB3aedgb1xQNhdvuYZrQ==", "token_balance": 33561 }, + { "peer_id": "EiAWtOu8NzFPJesBacHDG2E3WISh2LWNjuZAHnXd6Cvg3Q==", "token_balance": 33561 }, + { "peer_id": "EiAXFu4gdiCHtG/A6VZdtCywJ5Lplh6APHZfMSrHdjix0Q==", "token_balance": 33561 }, + { "peer_id": "EiAYIsoULoY6bIekM+Yqa5hJYuKhPSVy5+2bGUIaUkx1AA==", "token_balance": 33561 }, + { "peer_id": "EiAyj7Dl3rlh2z6B6gACIJv5IjLCm7gJSk/2D9H5VFOTUw==", "token_balance": 33561 }, + { "peer_id": "EiAZB+lRay451UHeCVksj03K0Zk4rGJo4efSL7mULpeh4A==", "token_balance": 33561 }, + { "peer_id": "EiAzguJnszJo+cNDo40ainUoR0lvSoPG4LT9LCXmUyMGJQ==", "token_balance": 33561 }, + { "peer_id": "EiB+0U1jaQW/33m05dxQ/S0OYnItEB1OMG+NwUyAH0PVxQ==", "token_balance": 33561 }, + { "peer_id": "EiB/8kobVeqeFljr2nMP0n5vDfod0S9WRzrsSFv9nkUpTg==", "token_balance": 33561 }, + { "peer_id": 
"EiB0bwtUxtSyFavpehB/DKEZSzjpg2P+l52DyTCGF88cxg==", "token_balance": 33561 }, + { "peer_id": "EiB0qeqkS1VR2xWI83HMr+ey9ZnMz/M1hQfsalpvrOAZeA==", "token_balance": 33561 }, + { "peer_id": "EiB1a1SKljSPHmso4+EfG1lMSEdFS0Wl84xh2gUNC96Vgw==", "token_balance": 33561 }, + { "peer_id": "EiB3mW1+NHsFaEWr8BS2TBf3okZdEs7LTwEBGLOS9fJUuA==", "token_balance": 33561 }, + { "peer_id": "EiB4JHgXoUSx4r/REx5wHJFk1YNlUR4zkVthOrF2G3qaIg==", "token_balance": 33561 }, + { "peer_id": "EiB4Nxn4n9eYwxSTMRT3PmhUDVQclQwjx6WHpRd5OUB8nA==", "token_balance": 33561 }, + { "peer_id": "EiB5JUEuZV7aXoTuF3DK2h+Rl6qhuq/pgQwh9p3ivjc+Dw==", "token_balance": 33561 }, + { "peer_id": "EiB5lMV8eGdxAq+nl3afjNspbuIOz0DbVcA7JAcXJpip6Q==", "token_balance": 33561 }, + { "peer_id": "EiB7mWD3Zu2E10et6LjGwn/i10hR/QzL7SR5JZ4h7AP4YQ==", "token_balance": 33561 }, + { "peer_id": "EiB7n66XHzp5X16+DiLjrIIoeliYqFUUzT2Nf57Haf1WMg==", "token_balance": 33561 }, + { "peer_id": "EiB8zyMv06OTwZn0HJB0yQh8YMKmaYr9IoT1hZlmT7blsA==", "token_balance": 33561 }, + { "peer_id": "EiBadGrAfFftqRwM/tSofGzpHbF1UEfgxcKCdPwP2pWcaw==", "token_balance": 33561 }, + { "peer_id": "EiBBWJR7DYIW2Ur6/3Lo8PtWruF40mkZsBd9pjhwvRmKBw==", "token_balance": 33561 }, + { "peer_id": "EiBC5VKF5YQfVSwPwKl2Ka+BnEquAYGT6PVBHzm1pzO2kA==", "token_balance": 33561 }, + { "peer_id": "EiBcmsS6EmEBO/YqVA9EtYDhMvSXiwTCbegWaAz7Ilggbg==", "token_balance": 33561 }, + { "peer_id": "EiBdAngPCp4mMyeT3/dp/tU5K8gRLkJbNRLpVBZVDOzrxQ==", "token_balance": 33561 }, + { "peer_id": "EiBDmeSPVzcneJHL3Lg4IG3FOY3hhhGQQHw/DBBDCiWDeA==", "token_balance": 33561 }, + { "peer_id": "EiBdnU3CGuhKS4i3CCOoXWUgd1pbP6BzssQyJ9uyPpsjoA==", "token_balance": 33561 }, + { "peer_id": "EiBDpkY2IAoq2X15Z1N7mzFsvvrkGK5Oo4Zptf64vWI30g==", "token_balance": 33561 }, + { "peer_id": "EiBDUFthmcCe+jlPcCUiTpfYdeL4mTseblHp49KH+HmX3g==", "token_balance": 33561 }, + { "peer_id": "EiBeh2Q4YiInxE24QXd7OCZWIufaP8WiIBVWHiMCN3HHTQ==", "token_balance": 33561 }, + { "peer_id": 
"EiBeKcob5o+L+Lc4xQcPYmayt3nmbDRiaXb+hcWAqeKYPA==", "token_balance": 33561 }, + { "peer_id": "EiBevqQOOZ+75Z3tQGS5UG9qHkUpg7ELZf6fD2t9PyqE/g==", "token_balance": 33561 }, + { "peer_id": "EiBeXZ7YqqTq+QKJig4FyCS+ozHPFJJ1yStJ0s8qnECP1g==", "token_balance": 33561 }, + { "peer_id": "EiBfcXJwnRrubRaLaNMbFCPEF4mIB1aeiMpEojbKDDiL1w==", "token_balance": 33561 }, + { "peer_id": "EiBfuLXHHGhdOu8zGgQF7QTHKuQga+BIizrhKplKvAMvRQ==", "token_balance": 33561 }, + { "peer_id": "EiBFXUH2G2KfAlodMJbXj2ox7QPw1jz5l0mMwICRO5Jo9g==", "token_balance": 33561 }, + { "peer_id": "EiBG4ll7Xd5GnZwnyoRD0CPPHwKI0XaVbXaDuBCwbuxetw==", "token_balance": 33561 }, + { "peer_id": "EiBGJELZMMnglR3l4CQF71ItkOzH042NnJ3Ms51Y4AJsTQ==", "token_balance": 33561 }, + { "peer_id": "EiBgpfYoiSVKy1WqQA0LNsRhHA6F1mkp4QvHWI2MU/UeZA==", "token_balance": 33561 }, + { "peer_id": "EiBgY6eN0MxGT3zLN7KzYnbIL2Sr5piDLyxrkbuPimeGAg==", "token_balance": 33561 }, + { "peer_id": "EiBh52+5wTVp60gTVCmREPZAiThINbKMs1NuT7fIOr2JnQ==", "token_balance": 33561 }, + { "peer_id": "EiBHeY4bA3u2JYRmjMoVIHJLZvK9QaLwsw1pbuD/kn8mnA==", "token_balance": 33561 }, + { "peer_id": "EiBHF2D1YFAMI/hjJAJ4B/y+PBhNafGEAxjccfJPishm+g==", "token_balance": 33561 }, + { "peer_id": "EiBhgKCiuKGNnM2Y1vwRLuIxAPR52monvCuG+qm3pv7pSQ==", "token_balance": 33561 }, + { "peer_id": "EiBHM3LPhhIhXX8oj7T1hJN7fBaKw30cn5zUMitxb8U6ZQ==", "token_balance": 33561 }, + { "peer_id": "EiBhM9eraY/qGeTnW+ZajQWMoOu3gBQ2z9fT0vM41gZwHQ==", "token_balance": 33561 }, + { "peer_id": "EiBhzc7WxJXbPS1IqD2KrtyOGq55aA1vSlSFaang1YTudQ==", "token_balance": 33561 }, + { "peer_id": "EiBi6d1AB30oohOQKpxTPuQcsCf9jAP+8es/m8xql+fxkg==", "token_balance": 33561 }, + { "peer_id": "EiBigEs+ON2j/hMhXl9ctg1oCyZiMIx7wTtYylx0Ou8wKA==", "token_balance": 33561 }, + { "peer_id": "EiBiMcu3TQx9OFyseXMW7Kta5qU9d6j1kfKJJHShymAkTw==", "token_balance": 33561 }, + { "peer_id": "EiBjNoymiVeNXeK8blTJrrpQEv0Pow9wNc1T5jufTpZzbw==", "token_balance": 33561 }, + { "peer_id": 
"EiBKaK+NyybyOJ8/ElLl4f833O0LuI9M7TPBoomWMSwOfQ==", "token_balance": 33561 }, + { "peer_id": "EiBm0As/9wBQb/u9j6++M3P/5Dh52/px0cLLMzgacwHF1g==", "token_balance": 33561 }, + { "peer_id": "EiBmkzxsNd2njEnsB2C9ZF6hSnkiFWgBV8jkb6zrboKR7A==", "token_balance": 33561 }, + { "peer_id": "EiBMXMuMJgVK9Reb9WzXszjSWIxWVNPOR7/wjmLWQuqgPw==", "token_balance": 33561 }, + { "peer_id": "EiBNK/q5mjOOGQonNkaOC/dvUOLMdQuT1FrvahlBzLNBZA==", "token_balance": 33561 }, + { "peer_id": "EiBNqPROwDeTq/wdw/wG+p77gdRxpQxYznJfDyy4PuPZ0w==", "token_balance": 33561 }, + { "peer_id": "EiBoyrGstUmmW796+68m315oYWHzkySxw6KQZhw/2QxjoA==", "token_balance": 33561 }, + { "peer_id": "EiBpCiEwzXp/IwAqBAm0aXquk+r38gkeX2oSwExq5FIkQg==", "token_balance": 33561 }, + { "peer_id": "EiBPilMWfKdMaHK3QUFo8te7MsJ28a/J99BdAwxTKf0tgQ==", "token_balance": 33561 }, + { "peer_id": "EiBpj2rICDPCIRRQ4X67hSXq/z1XOyi+VCztwwexkS3jDw==", "token_balance": 33561 }, + { "peer_id": "EiBPTmUxdd/X1LidP2a5919j9sNTJwmT559heKGcWUX+EA==", "token_balance": 33561 }, + { "peer_id": "EiBQ2y/H0VkSU5OvomqL47R6I+TsHpWouU0Sw4sXiyIUXA==", "token_balance": 33561 }, + { "peer_id": "EiBR9t0JKdnTBtp3FfjIwOQkNuTzEOwrDGBR67ir6O2Drw==", "token_balance": 33561 }, + { "peer_id": "EiBr9VmoOm+4IkBb6w1/Cs5YiECgcKSIFg15mWO3PXtxmw==", "token_balance": 33561 }, + { "peer_id": "EiBrA2jFA/vMnltH2CGgXtTHh6jkIsHU6PeiCH8el2O4tg==", "token_balance": 33561 }, + { "peer_id": "EiBRrIgSQDM/6Z33kSuomTRnaI2bhV/e9TYIR02hrBpKaQ==", "token_balance": 33561 }, + { "peer_id": "EiBrsy7y859RkDavqTzB7xu7d+812SgziYZp310Slx5aCg==", "token_balance": 33561 }, + { "peer_id": "EiBsmsQAsfN1HrPI1gwat4lSiV7gY7wgVj9Bv7Z6s4RQLQ==", "token_balance": 33561 }, + { "peer_id": "EiBsOLRx8jfcINZK/ssxgcwyVTrA4Qblw2j3glZ13bnOtA==", "token_balance": 33561 }, + { "peer_id": "EiBT8DnUHZSGwPCkT61r7dIrX30RXJ3ESnDZBTaRwR50kg==", "token_balance": 33561 }, + { "peer_id": "EiBTBk/X27/DClwVSAWjAFQV5OvcBC+sH8slItb84a3Kjg==", "token_balance": 33561 }, + { "peer_id": 
"EiBTeMZAt5ic7noc06AEJY8l6qScDHGtdNKWxH33mqcpbQ==", "token_balance": 33561 }, + { "peer_id": "EiBtGIu0h+64wjQVG4kQoxclXt6OCnG9JIuvXutYsPFJFw==", "token_balance": 33561 }, + { "peer_id": "EiBtYfZnWJu/Ar1oDXp0FM68VQDvBalJ9Cbq0E6RX7HOXQ==", "token_balance": 33561 }, + { "peer_id": "EiBu/WWMQHxaLsX1XyqYy25uAIE27G6iSKvM5itJ0rs9Kw==", "token_balance": 33561 }, + { "peer_id": "EiBUBgV0XJHRn3PmWtkISus5mlE5rB9vcUi51ZoYBNUm6Q==", "token_balance": 33561 }, + { "peer_id": "EiBUrVQqgIkfw+8jibFTiIRNbH7mSTtsW8HzEZmIeoZrUQ==", "token_balance": 33561 }, + { "peer_id": "EiBus/DIfBSaJGdwgK5Ooue1+jZ0ZVd8/9LtK1HNCqgx7w==", "token_balance": 33561 }, + { "peer_id": "EiBUyrjgnJtVbvfHI5kZTGiKvgvXuXWIGasC/nHWTYy0HQ==", "token_balance": 33561 }, + { "peer_id": "EiBVCf8/gq2S9gGLAKDAJF81I7uYcG4PfT3opAfBviSIIA==", "token_balance": 33561 }, + { "peer_id": "EiBWVKy5Ft92suGdvj4uynD025iyK1bHQZ+S2fKGDMdLkw==", "token_balance": 33561 }, + { "peer_id": "EiBx3EdbtPNcNViV6MR6LOq3lPJvwoArGzbpzxYzAzMG1g==", "token_balance": 33561 }, + { "peer_id": "EiBXDIUOsIXCVQHS+sW7B+Yg+1OudVthocHU0LTcDxZwRw==", "token_balance": 33561 }, + { "peer_id": "EiBxELcqLhVBUY7ZZyW/JNyTxdXzgYkzh51S4C8FBSl7Sw==", "token_balance": 33561 }, + { "peer_id": "EiBxgKPONqQVlIk5VqM4KVDeM6KSrbegG66Ncn4ySAXW5g==", "token_balance": 33561 }, + { "peer_id": "EiBXHNMT5mdoPQY2JfrEsu4amVv6Eggx2AEXrwXx/UHQ6g==", "token_balance": 33561 }, + { "peer_id": "EiBxlyWYOKw3ZQgyOxbDVINASSXczl4lAvccN33rO43JmQ==", "token_balance": 33561 }, + { "peer_id": "EiBXqH9N2MPXhqVE/5T/eCC/XYog8kiO5swlKMB4iMwrtQ==", "token_balance": 33561 }, + { "peer_id": "EiBXUarTiitJ2PEwqzo+l6AEld0m/W5qKw6pfFGZDSHWdg==", "token_balance": 33561 }, + { "peer_id": "EiBXXLcXOI3lI7+Yz3YpiAkeErANUw3zeVkMOHUM0DQvEw==", "token_balance": 33561 }, + { "peer_id": "EiByHO6dP93FY3FsuPVJ98BgFlXs03SaFk3w16GDKjJyuw==", "token_balance": 33561 }, + { "peer_id": "EiBypVSidPouWbIlwiVEgWDYhl0mXf870eIP/kBiCQo8oQ==", "token_balance": 33561 }, + { "peer_id": 
"EiBZzUZGBxNnfPFELzIy//rJmU0FpRk7Mu2aZv60wDGFBQ==", "token_balance": 33561 }, + { "peer_id": "EiC0WRkyHHxUQML+030JsEKoKMUzf0JbXGtl3oxW6K1kdw==", "token_balance": 33561 }, + { "peer_id": "EiC1l7XzOaZul+EnpRlNqicYIEQ3cIwe5FvAert0qvOxUA==", "token_balance": 33561 }, + { "peer_id": "EiC1RVPdNtjwIEH3ZON3OrVrFvOY1acugPS0KKpGO5ZVmQ==", "token_balance": 33561 }, + { "peer_id": "EiC1ST/PIv2uheK9Zt6HWU+PByWA7PaV2SForEClUDHEeQ==", "token_balance": 33561 }, + { "peer_id": "EiC2obFK5cPgQe/IuttjHqHYni8GsYsblYGTkQ4HvY6LZA==", "token_balance": 33561 }, + { "peer_id": "EiC484zYPCYAOQTc7I7BrpO+5plVVeW7HTjvmVqhQw6BHA==", "token_balance": 33561 }, + { "peer_id": "EiC4LgRgN6I8jvTES08ggVznv6CDpYQF7quw4TYVnsml4Q==", "token_balance": 33561 }, + { "peer_id": "EiC68ul1+gDOyG93CVMf3+TIxIRKodKmc/e5Yp4wZqJ+Mg==", "token_balance": 33561 }, + { "peer_id": "EiC7m7Q+geHtVjVqRKYTEqXcyru2IvSNyQXiLbwIVWVw0w==", "token_balance": 33561 }, + { "peer_id": "EiC8ZP9nLpSPeT7nTBYazd9V3RaDWoA4Ekziz0v+gHYRuQ==", "token_balance": 33561 }, + { "peer_id": "EiCagErmVTA8ytscM1QaWqFWRDC7mzQfBeUo3IqdCUnYsQ==", "token_balance": 33561 }, + { "peer_id": "EiCb34w0U9JqmsbKeYoJmhabt4FnNeydaq6Oqlvq1w5/fQ==", "token_balance": 33561 }, + { "peer_id": "EiCb9d8qxJ0iYDgNwO0gx8UIiJdikiIVUhcQrVsK1zGzCQ==", "token_balance": 33561 }, + { "peer_id": "EiCBHXr9Zls4Ov6VNntiy2gEKvb1mNjS2kO7zUhMRSmieQ==", "token_balance": 33561 }, + { "peer_id": "EiCE5oe7nQ7JpnNtSLXCMiHq/2SDuNY/FlIBdpFawDACCA==", "token_balance": 33561 }, + { "peer_id": "EiCEWTPV09cZrP87m0EdX2ZBNF3egeqEiwAMYz2ZYrgjhQ==", "token_balance": 33561 }, + { "peer_id": "EiCeXjEqt4hs/vmdpMsDVtlgnniNDzp9A/8z1O6PKDEukA==", "token_balance": 33561 }, + { "peer_id": "EiCfKRYgC0nEsbmFGY1nI3RGo9RCdzMZRLfQbMys9OAMsg==", "token_balance": 33561 }, + { "peer_id": "EiCFWus17WYdzOD7dcLAZlcPlfKDUy9QvRRYdg/YJBn48A==", "token_balance": 33561 }, + { "peer_id": "EiCg81scwxZyLkhMSgclSIVx6VKeoXMga4i3iMaKCMjUeg==", "token_balance": 33561 }, + { "peer_id": 
"EiCgOhJzjzGYHpwiqntXrfhuMnU1V6/MTZrAFjGpX+ydWA==", "token_balance": 33561 }, + { "peer_id": "EiCh0pcSGxXjRXf7OXcGg9wQGqQA/4DyrEh30XnEF6H4fg==", "token_balance": 33561 }, + { "peer_id": "EiCiC9dRImqGhqKABMUt8Hl81OpQwoxnJ7m0UASpH4lItg==", "token_balance": 33561 }, + { "peer_id": "EiCIm4dpeLlI+qLLHZ2Gsw6QKTM7JcxVobtsee/TTcWODw==", "token_balance": 33561 }, + { "peer_id": "EiCinhN5WURUo7iQv7zb7DYnzsMzij0sM15/shTTQ287DA==", "token_balance": 33561 }, + { "peer_id": "EiCiwC3yv3ePIt70lxe9PtWNhrwr7mXXBMJV+Xi6k/FKmA==", "token_balance": 33561 }, + { "peer_id": "EiCJ0rBmh6l4d2ygoaTxnbgqEX+CG6xdxKTa0MDnx2OC+g==", "token_balance": 33561 }, + { "peer_id": "EiCjCuorEu8LGrdJQyTzSsOPoKWbU1sGMTKENMDBZEkSWQ==", "token_balance": 33561 }, + { "peer_id": "EiCjL1Ek53nbh/WinVrH+kzmXRWOsthbobrTCzc29Jo7xQ==", "token_balance": 33561 }, + { "peer_id": "EiCjtvzkM8CqHvUHiQSzVjz/gSeg+2705f4/bZphUkf/NQ==", "token_balance": 33561 }, + { "peer_id": "EiCk5x+DdW2U38dgxiiQnSP/KB4rxn04lvdPJuubVrQdjw==", "token_balance": 33561 }, + { "peer_id": "EiCk7DYBnTkEz/UVj+X6AAZ66zrtDHPWQA8X2pah2hoccg==", "token_balance": 33561 }, + { "peer_id": "EiCl0cat1aSXx4nGYdOKNnvRFZQAYcqqDgtrh+g32j0uqw==", "token_balance": 33561 }, + { "peer_id": "EiClic3+AUmTDVqH8juQxGu1w9FEOf76bEBJW3l6J4S6cg==", "token_balance": 33561 }, + { "peer_id": "EiClV7TNuDYNywNzLpIxPTYsT9Gy2xlvNlBIIwgSMDYsbg==", "token_balance": 33561 }, + { "peer_id": "EiCNmcgNpLGN9Jwjyf/q7eCyDSA9U/Ba7RK7BX0kc/iOQw==", "token_balance": 33561 }, + { "peer_id": "EiCO9rhg1Rf8aFhOdBBxhXb4hOcXalPhFBJN1752/X+hdg==", "token_balance": 33561 }, + { "peer_id": "EiCOlhap9AJPpU8Bo2ZqhalrLyCtXfA9ta1kfPJ4QH4siw==", "token_balance": 33561 }, + { "peer_id": "EiCOnJABC9Nas3tONmczSklaxqkZDOaC+Pf4rEt4ufpu1A==", "token_balance": 33561 }, + { "peer_id": "EiCPE6wq/EFjgy8XntAioIOZQBtyBMuoJNdVmYRr25CIZQ==", "token_balance": 33561 }, + { "peer_id": "EiCq4Qqp06UxkbjKXHytu0yjUZLC1vXmYtNvydr3PMFj6w==", "token_balance": 33561 }, + { "peer_id": 
"EiCQDcloEnVyFTU04lxDTIP2XlJvlan0T8MCw/chwfZrSA==", "token_balance": 33561 }, + { "peer_id": "EiCqJwAtWrf4FJfj9GRsPSVaJ0p7OX5onfgrSURWeH8xIQ==", "token_balance": 33561 }, + { "peer_id": "EiCr1+4YvnxvquEKEVr8aZlkPdv9xPO1tvb5j/5EjjXnLQ==", "token_balance": 33561 }, + { "peer_id": "EiCRgIfoKLobbbSet+CIcoxpwmRGfZQmtexQGAL/fqParw==", "token_balance": 33561 }, + { "peer_id": "EiCSAdTRYWnxcQGQ9czd8RVql4W7unD+kCYz9/6WSjtmhw==", "token_balance": 33561 }, + { "peer_id": "EiCT+3Yt9UCVkwSbFxeJVvNUHgeqVVVO5QWeLnIGo9F7wA==", "token_balance": 33561 }, + { "peer_id": "EiCTa4639f/X5q+D74mSuC3QleLO5PMpCedo6bSnvhPveg==", "token_balance": 33561 }, + { "peer_id": "EiCTehwHrsswcYCnXB1PtjLhspaKVqU8rcCaD7crN0JKXQ==", "token_balance": 33561 }, + { "peer_id": "EiCtn8WO1aBlmjhXe+H1enGXJ4qq7owbchsgbClCRe6zEw==", "token_balance": 33561 }, + { "peer_id": "EiCV4gsPbqhe3dbMmrM9dY8rQRl0nSyzbD8bdFv7JVbB/Q==", "token_balance": 33561 }, + { "peer_id": "EiCvgCZr6YyWRGqiqEXv5ZABVwGyptZomlFkZINV68Uo5w==", "token_balance": 33561 }, + { "peer_id": "EiCw/Q8lTBPKAVWIoTdHzLTnngu/wZLlLnGONUAavDUdQQ==", "token_balance": 33561 }, + { "peer_id": "EiCWGm9hyulhgWc9FH9ejedN5kK0FlXCIYMV4JaNRh4o4g==", "token_balance": 33561 }, + { "peer_id": "EiCXGTygpITlerQe56QbBPQvH91IjalW5EM+iRKspD1NKA==", "token_balance": 33561 }, + { "peer_id": "EiCYE8yfBKAhsLg3c+nNsgZfuPkpqesC2MzEA7cGagZO0Q==", "token_balance": 33561 }, + { "peer_id": "EiCZ+QsNHYSeupXEwwpqZTcbmiVhs83YpXnqpAiZZN0QEg==", "token_balance": 33561 }, + { "peer_id": "EiD/WU32JObv7p8TukO4W1HTpbWFHPKMinRzt9XnHC3tJQ==", "token_balance": 33561 }, + { "peer_id": "EiD0PxxnNwV8tUqsqEL8u+tbFOO2/DgTam+E1iJyymECHA==", "token_balance": 33561 }, + { "peer_id": "EiD27CTm1hX5sfjJAp5aZQ3ZGQMJoKYjS6Ew08SLbWEroQ==", "token_balance": 33561 }, + { "peer_id": "EiD29GMnzfaDVuX0yzSkhdVKcUdklLReD0OO0J4rkA+ydA==", "token_balance": 33561 }, + { "peer_id": "EiD29YiZZBZnDL8O32tMrG7lfGmBEI1RJLVCposmEXrcNA==", "token_balance": 33561 }, + { "peer_id": 
"EiD3bLzoV0vdqAVD0B/lTSbKlV2qitlQuWf9TrFolf6kBg==", "token_balance": 33561 }, + { "peer_id": "EiD4EmSxter0Gv6sGLKSPgKQv3/e49vt9x5xgdOqyWFwWg==", "token_balance": 33561 }, + { "peer_id": "EiD5Bn9Ln6lI1zklAyIsM2+KR/fwyKL/rwV4YY3DuqjUew==", "token_balance": 33561 }, + { "peer_id": "EiD6/Q3HhVJWC9tNL+2kTroT/vRsNFv8nIKuAYRDZXStoA==", "token_balance": 33561 }, + { "peer_id": "EiD60ZIpO3Lzd+mmDUqxWOxD2aJCuENTdDudWcyrCehOzQ==", "token_balance": 33561 }, + { "peer_id": "EiD66rvM4SOZimHZwjosNyaL966f6109BLPuhkxGOYl1Lw==", "token_balance": 33561 }, + { "peer_id": "EiD66WAy4F6v0gXPFAZq1DIbejNT/PiHtO9cZmSgzAba6A==", "token_balance": 33561 }, + { "peer_id": "EiD7HhMlv0I+DO7LgaCXldw77czuyU3MWDX0E73e4DHQMw==", "token_balance": 33561 }, + { "peer_id": "EiD7KNmH7BJGvmXs5U2ULcLfEvjckhuzZyAgfQ4iLBsIOg==", "token_balance": 33561 }, + { "peer_id": "EiD7YoR7BoOlrDkV6SDbSZ5R1XxhAYpTXf9ZZkc0JywXNQ==", "token_balance": 33561 }, + { "peer_id": "EiD9W4Gx3C2Ir0lV2G22pbOX6w/pl5U5Xj8/QqwpmypuwQ==", "token_balance": 33561 }, + { "peer_id": "EiDA6nzZspLd9A2WXA86Wos45y3/6hauVJMJpWVVRhKMpQ==", "token_balance": 33561 }, + { "peer_id": "EiDAbSr47sbAy8IWtA7l43iYx8KcsXC/4UVmf9IYFYZXXA==", "token_balance": 33561 }, + { "peer_id": "EiDAKQjpZjFKd+e2xpv+zWxXlJtzRRJVIQln8Q/sJeV2FA==", "token_balance": 33561 }, + { "peer_id": "EiDB0azciKU6nhYyVmQnhdiESyemEKSBAyC+0v1Lxk6vXQ==", "token_balance": 33561 }, + { "peer_id": "EiDBeD7AHN9N8lDgqekN5xMCgu6AG26BkizF+3KlK8//SQ==", "token_balance": 33561 }, + { "peer_id": "EiDbPN8P4wl9UftHptd6nID1mHVaUn1pjH9RaUbULQfM1g==", "token_balance": 33561 }, + { "peer_id": "EiDC3Qg9wxNXBNXUg4etRoXcDZ77zs4sha2EjDFGjkBs3A==", "token_balance": 33561 }, + { "peer_id": "EiDc4eGssM/nIMWdlRQYFN+I+FKkTiqsyLeAmMI7u+DqGQ==", "token_balance": 33561 }, + { "peer_id": "EiDdhCrFlFXziz3Ks2EP4aGA+baZH+aSq88HFF/tHLegDg==", "token_balance": 33561 }, + { "peer_id": "EiDdJ2VBhCLCdxy5W0uRM7DQu1N+u34uS2mmzhx3Gi98JA==", "token_balance": 33561 }, + { "peer_id": 
"EiDe9W1f6gSofLBD292UzOL/a7y5ve9nro04d7P1TT+xmw==", "token_balance": 33561 }, + { "peer_id": "EiDeqYbTSJxcKTK8OVrTV5Zfh9hfEV6yzL+DJ35qyPUyNg==", "token_balance": 33561 }, + { "peer_id": "EiDfIbOJWhFFdAtd7Ypr2JbiN/Fdv0+7O9Yp1ceLezOLrw==", "token_balance": 33561 }, + { "peer_id": "EiDGdp6wRE2Vr3H6jMTc9hw7vsW0ekIjhySU+oxLHHIDcw==", "token_balance": 33561 }, + { "peer_id": "EiDGFcew84luMddwYdBRUrau8wUFqBfVmXXkeCa/yzT+jg==", "token_balance": 33561 }, + { "peer_id": "EiDgIQPTwxPNU+sxzBYH852WdNaWzCP8N8Q4HzY3sf88VA==", "token_balance": 33561 }, + { "peer_id": "EiDglUe1WsMkRs1bf1h8Ul+wEOnLUmih9S6FeE6aDWTGhw==", "token_balance": 33561 }, + { "peer_id": "EiDGu0ifK+mUbzKnHimQhDzO5V5xiF9pWry/gvIySPE8ow==", "token_balance": 33561 }, + { "peer_id": "EiDHRYbkf4lnWOKiySMj+Sz0bN0R7gckauP9rI277lYJNw==", "token_balance": 33561 }, + { "peer_id": "EiDJux9BM5SomowX1NTnYFBFSPOokE41C6DJ9Is864CvKw==", "token_balance": 33561 }, + { "peer_id": "EiDK1hOcfGrOh5UO4Pw+yYDv2MRs7aQoj+Znatddm1zN7Q==", "token_balance": 33561 }, + { "peer_id": "EiDK4CxNrEUn2moWMgEqIK/XqQozt4EJbS34ffrs7P/ItQ==", "token_balance": 33561 }, + { "peer_id": "EiDLAOrrGUnhrsjHAZMdqEmAlMdhyQw821xbs5c3t88EiA==", "token_balance": 33561 }, + { "peer_id": "EiDmgHCG4xZhd2Ks/t07nFDXARlzV/+N3yexBUlNTmqnrg==", "token_balance": 33561 }, + { "peer_id": "EiDMixUJXTtGcRyxwE+uM5UlfsxED4NuI4LVI95oxAm4BQ==", "token_balance": 33561 }, + { "peer_id": "EiDMY7IXiZB3issJiG8wX8Lb9Iku9i3fVV6dqqVFjBvz4Q==", "token_balance": 33561 }, + { "peer_id": "EiDn2Q10/Hym961SYwzfVmbp1uEfueMmiqZk4mlNOzuvbA==", "token_balance": 33561 }, + { "peer_id": "EiDnYFVX8Rr3ctyH9Nz0rlDXfwc+U8WQKeXoK7k43rP0UQ==", "token_balance": 33561 }, + { "peer_id": "EiDOn1yXt4t1bOLMBM+Lh6Pm4JBpKP6K3lV2oPMtEbmjkg==", "token_balance": 33561 }, + { "peer_id": "EiDOPuCY+XHn+deTf7FScuJSPK7uRN7bLekSMGSY0SXgiw==", "token_balance": 33561 }, + { "peer_id": "EiDoQskpaCTOEuP1QuawoVdewVHlc24Nxv1IHmXcjvbXgA==", "token_balance": 33561 }, + { "peer_id": 
"EiDOsXWYGeFSQCNs8jzysDXMpwhRCnfTiRdM7vFxw2tOPQ==", "token_balance": 33561 }, + { "peer_id": "EiDpbxb5eRcbqsvgLSLcTphknlUUmZg7E9sDz1wftKxCUw==", "token_balance": 33561 }, + { "peer_id": "EiDPiFiPckjEifgudwkmcfVueFX//ZS4t8LTq69N4kKIrQ==", "token_balance": 33561 }, + { "peer_id": "EiDpk+sQ42fIHEPCXjKswQVnq7XVQ/ySn3kfCg6tDUzBfA==", "token_balance": 33561 }, + { "peer_id": "EiDQKKqh6xsXAGdT9P29p+kcVobboAnwZt77bnLiuRal/Q==", "token_balance": 33561 }, + { "peer_id": "EiDRoqSJBW18n+qGu1RBMDsL2Sglkz3YGteYixIpRORvpw==", "token_balance": 33561 }, + { "peer_id": "EiDrVC3vX1ahb38Q5GJz9u7osC4EimbyhEFd6x23AijPqQ==", "token_balance": 33561 }, + { "peer_id": "EiDsI9lx0jiJ5L9Q6CWsnhnZfP0kCQ7Pvq6g7WdcQhyhew==", "token_balance": 33561 }, + { "peer_id": "EiDsSAzu5rDFHenTqj1GdXQw+mVUMDoMvWLeQKdK546F5A==", "token_balance": 33561 }, + { "peer_id": "EiDT/3ugdOvxgb4vg6x5CGFMf4js65U55aWKCMwK14AJGw==", "token_balance": 33561 }, + { "peer_id": "EiDtAtMfou/MhHrQNuCAKY/nExJGhUqPHy6q4s+kGzkTbA==", "token_balance": 33561 }, + { "peer_id": "EiDTxgd6YQr8q720CBjgBVHn8RH7cm8V4vp/TcPZznBR8g==", "token_balance": 33561 }, + { "peer_id": "EiDUUpBTQqf4O9avsTWT04PGDAwSjluLCXYrlZrmcnIAzg==", "token_balance": 33561 }, + { "peer_id": "EiDUxwzOEhSqfmr1I+gKAjOHOiRIzT5Lxi6JZdTEbn+9ZA==", "token_balance": 33561 }, + { "peer_id": "EiDvKwaMeYr3wtknCAbVV7S6lbEVbLKxKsdydZr7vAJLRw==", "token_balance": 33561 }, + { "peer_id": "EiDvlFO6IxfAJGQFHQQ1KNlFRFgGdnVCXf3QO42fz0O7NQ==", "token_balance": 33561 }, + { "peer_id": "EiDVRXtI2+zXpUC0tNe7h284vUwSN7xUde43hxUx5KiD1g==", "token_balance": 33561 }, + { "peer_id": "EiDwQWP9U75DFY1t1wRFbp/U+cj9l/UaTLY+MuGBYsLLDA==", "token_balance": 33561 }, + { "peer_id": "EiDX/W0a43KqHcx3jmbzO8SQxNHUi6+j9dq7F3Nzxy3vCA==", "token_balance": 33561 }, + { "peer_id": "EiDyedbXzqeXwwQyirQTKae/E5WQNhzKYf56KRP7xFyysA==", "token_balance": 33561 }, + { "peer_id": "EiDYOZL8lMNDKPiV3i86wbTjQitdNHW2VW0cy3gFJ0axgQ==", "token_balance": 33561 }, + { "peer_id": 
"EiDYt8fxGG/ULy6eX/t2kZxp8f16taumsvEX55JSgTJtwQ==", "token_balance": 33561 }, + { "peer_id": "EiDz/eTYn8bsQW0nD6iDSUAVd/ZQuUOy3CQewg2IQQx3vw==", "token_balance": 33561 }, + { "peer_id": "EiDZ06PQVTfdNqed8CeAK1Iv2u2UmlWqEo83OYfrblE/6w==", "token_balance": 33561 }, + { "peer_id": "EiDz0lmuI58BkBDLD744KQ04rbHFoG9mynSN8zzmQOjypA==", "token_balance": 33561 }, + { "peer_id": "EiDZGlv2zmXBgSz5r0a0GgUzyhcgfnkRHn3uj656cQL2KQ==", "token_balance": 33561 }, + { "peer_id": "EiDZnFHAENyupayig0aaBuEpD9LI/l/MarlPFvOXx1uYBA==", "token_balance": 33561 }, + { "peer_id": "EiCCL8IIBky56tp+bZIx0NHono36QgPVhoVFnbXXimZpjg==", "token_balance": 33561 }, + { "peer_id": "EiB3vW1YNxiqw2q6W0Qszof/Jv3BLvJvTNKRAzvlirfm0A==", "token_balance": 33561 }, + { "peer_id": "EiDiOz2nMrxqGBqsck8CcamwMsnZY8NlhXbl0IoqX0UZ4A==", "token_balance": 33561 }, + { "peer_id": "EiCJQgxkDhPQ7lBR9HYet5wKumBPl6YKCLThRcEo1BRPvg==", "token_balance": 33561 }, + { "peer_id": "EiAvwrixOsPLPWD06V7YvZbgHYFoCj8aTX/4V6RNIWSeFQ==", "token_balance": 33561 }, + { "peer_id": "EiA0uBCHhPvWzprp9sUCHiGBsR8cR6FGeiY2EP0w0yfKDg==", "token_balance": 33561 }, + { "peer_id": "EiBGXBHH7axfRHYhZIx03raLDe2kBgznSYRTNRbv0nrZqg==", "token_balance": 33561 }, + { "peer_id": "EiBZYHxq3m+Hmx6MxQD5r2NPjkHQ6IJXvEKEz6GSbsNsrw==", "token_balance": 33561 }, + { "peer_id": "EiAm45a2yURP5fS/qhslXzx/jbHyoJ+aYlh9JB4mZPKenA==", "token_balance": 33561 }, + { "peer_id": "EiC5wYjJ8IIGWXiNSRLJ55er/hUv++PoN6i9DgXq7aq+wg==", "token_balance": 33561 }, + { "peer_id": "EiA+p5KYXfEdSFVw9BhsgGQQVX9XyxOeRpmKI8guCf0Qwg==", "token_balance": 14131 }, + { "peer_id": "EiA2LTxEOq+aY3oXqW6II4rxMs++I3m6b1F/385DazbhAQ==", "token_balance": 14131 }, + { "peer_id": "EiA3iH1DhRv2Yq1foZJvJrSkBxnJpyBHgPWOKOzrSXlywA==", "token_balance": 14131 }, + { "peer_id": "EiA79Q7Re/ov7XBzR0oxeJgrPKzWlTt0/2deGNrvLSowbg==", "token_balance": 14131 }, + { "peer_id": "EiA7C6zF40t7xoAjwePQjrCfIdh1t+1hmU510igs0hV6kw==", "token_balance": 14131 }, + { "peer_id": 
"EiA8pL87yzOUyMDx1FzIM4muQv6Jpaa2CQ3PdGEFSdNpGw==", "token_balance": 14131 }, + { "peer_id": "EiAACXXXzU9DAQrN4gaGVgl4b216C/AH9hZxNv7eyhjazA==", "token_balance": 14131 }, + { "peer_id": "EiAahlQqT3/UlFvc1p+ww8S8S7gbK14Dyde50sK8hTINIw==", "token_balance": 14131 }, + { "peer_id": "EiAbO5mcT16RxhiZOozqX6rM3jVswTWBTOkq3QbQKgN8Yw==", "token_balance": 14131 }, + { "peer_id": "EiAcWp157pz7+CWI3VR2kWexFjndqceZkDJmaEnRYK2Eyg==", "token_balance": 14131 }, + { "peer_id": "EiAeCRrFuX5C/kdZYTVfriE1e3cQSQTKOEdnb62SCCKxag==", "token_balance": 14131 }, + { "peer_id": "EiAfa1YbG6suWnaDdeKePJDtcCdKn7sOotk+6evEaYGSVA==", "token_balance": 14131 }, + { "peer_id": "EiAgo0D0ewh2ozj2o4w6eifg47bAqg/aASPnODHq5DbJsw==", "token_balance": 14131 }, + { "peer_id": "EiAGpy+7f6G8KJO90sZhEwtms2pb/uV6y46FrGOAfLWciA==", "token_balance": 14131 }, + { "peer_id": "EiAh3ml2qwcOLuYmLNkCzdoqhKQubSGodc5c52TX7O5FIw==", "token_balance": 14131 }, + { "peer_id": "EiAIOvtUvZbLmyuACX8m3iVXBlWcbTeDe/qvT/Z/8WEoIQ==", "token_balance": 14131 }, + { "peer_id": "EiAJ05XAk9ozclScMhUY0JjMScIoltFIRnr9S2+FlFuU7Q==", "token_balance": 14131 }, + { "peer_id": "EiAJUiqPKU7Mtzv1uNUo2Eb1dLQH1AQXHCs9Y/mdE8LZFw==", "token_balance": 14131 }, + { "peer_id": "EiAKHUO4r8n420rLFpwpPGXvhnyjPmE/I6XU0UXLNkF8QA==", "token_balance": 14131 }, + { "peer_id": "EiAkmBx/IweHwGUFtS+wTJee30lN/IjD0WKaoezBzXTtYQ==", "token_balance": 14131 }, + { "peer_id": "EiAl1/Bp+0eIMrQQHz/Cq70g4LAqCPFm9pL8fXjynTDWIg==", "token_balance": 14131 }, + { "peer_id": "EiAlVBObQQZQlXJog9KHjcwCNn6My7QPDeUrQ5jhczFKJQ==", "token_balance": 14131 }, + { "peer_id": "EiAmclu9uZtgv3s7OKK1RgSZthUzSHfgKp30/RpB2sVsYA==", "token_balance": 14131 }, + { "peer_id": "EiAmwRgulhRQZ3k7ZfMzwfQu01svPWc0mRZUfzisyp2eXg==", "token_balance": 14131 }, + { "peer_id": "EiANWjxJzSc613LL8OuVeGuoCGsUAH1ffsyAzIXf/amqeg==", "token_balance": 14131 }, + { "peer_id": "EiAogWRRTBr2swYNDUDx8pkbW3fbSFV5XLzvyHbSP74mUA==", "token_balance": 14131 }, + { "peer_id": 
"EiAoW+AnTP/1jG2nGDcKEky5yLjj10rxOK/cS4VcHWHTgg==", "token_balance": 14131 }, + { "peer_id": "EiAP80CvojfPuRBG+VNJxSFonvhgrY8bflDWEsHuDlP3Jw==", "token_balance": 14131 }, + { "peer_id": "EiAPjOb8we+ocPGyciL5pgUFjLQEo4seVztvbI5qDowpIg==", "token_balance": 14131 }, + { "peer_id": "EiAq/9aanJWTgxULhlycv44eYd2sT/OSFkXYnQPVE70XKw==", "token_balance": 14131 }, + { "peer_id": "EiAq9OQhcZTIEVWNUMkAgAIdOyVyK9Pb0GoF6JzMtfz8ow==", "token_balance": 14131 }, + { "peer_id": "EiAr7LQ1goFdkwaYnLWZZnL7pnatm4rr5SDff8XU/k4fTQ==", "token_balance": 14131 }, + { "peer_id": "EiARCleCxEfq9N1oHMEGiEwm6M75QixANqXQGEIAsN8iyg==", "token_balance": 14131 }, + { "peer_id": "EiArYOBCuzHjKaOvc16KZWSaK02j+0z/ezXwS4I5tnSySA==", "token_balance": 14131 }, + { "peer_id": "EiATomioP0BdxT8l6xtqM1pidOs6XIkAFn3PnvnjjTgYlg==", "token_balance": 14131 }, + { "peer_id": "EiAv/HAyL9Flsw3UOBQaRiHbYvKn7u2SbZwfZz+Gwk88fA==", "token_balance": 14131 }, + { "peer_id": "EiAveS8oMO93ID3JT1eEgjUF3as/DpOZ2U/fY4TY3sClJg==", "token_balance": 14131 }, + { "peer_id": "EiAzALXcTZtECDgNMn7EbHYeGdFqumYukgPuvlEtY9Hszg==", "token_balance": 14131 }, + { "peer_id": "EiAZl/e92G7m7+MWa/q/CCliixw/LgSjRLMf+EoXJo2A2g==", "token_balance": 14131 }, + { "peer_id": "EiB104ofyZgg4SbNV1hbCxrupU2wBlA9RAyFL182vf7dpQ==", "token_balance": 14131 }, + { "peer_id": "EiB44PpKKYsov6QTsEL8qg4xjdfWmseav2cEq1wQF2q45w==", "token_balance": 14131 }, + { "peer_id": "EiB4CLe1iirwASrbNR2u1SIM53xaqJNpaaWnEpVLqFNf5A==", "token_balance": 14131 }, + { "peer_id": "EiB4SQlsGRD1xkWRy0zHa0wvrBlIUVmOGsownttpWHM8SA==", "token_balance": 14131 }, + { "peer_id": "EiB5Mzanmx9YS9nPK4c4FmMZULvJVydieWVRLTa2KbUmjA==", "token_balance": 14131 }, + { "peer_id": "EiB7dQs9O4zOkLS7TzjUNfxquzcoIezZ91YAg8DkzK4heA==", "token_balance": 14131 }, + { "peer_id": "EiB7DY3BGfeQ7My3lzZv2ZrGj2UFn5a7UC8n3dQdAguwcg==", "token_balance": 14131 }, + { "peer_id": "EiB7h+COLmrKzVuh6revYAeuGf0NuP62L7qTaTi3Pp4W2Q==", "token_balance": 14131 }, + { "peer_id": 
"EiB93kvIpMxjMQQNEWCmxoKeM5XD+2Lqqk8OJHYsZNCGMw==", "token_balance": 14131 }, + { "peer_id": "EiBABy5T0sFNCLuoLjNULkXJlLstpo13RsPvuAIKCiYQPA==", "token_balance": 14131 }, + { "peer_id": "EiBaFhAiwplT1pJf3uxKMcspM07osb3dOdwGSJk11rTR1Q==", "token_balance": 14131 }, + { "peer_id": "EiBaKSSnCi/ZfokmQApSYtAi4xvxbT5B263CkJqAt7YSAQ==", "token_balance": 14131 }, + { "peer_id": "EiBaKt+mklDecWvS1m17WSFlb1JjJDA6uXBcMRIr2NTz/w==", "token_balance": 14131 }, + { "peer_id": "EiBaUNq3wdeadsrrfv7UeOMCCnb+ilJ8b9rez8BxnXFDlQ==", "token_balance": 14131 }, + { "peer_id": "EiBavPp4YwTyr9/3pCeWOBVySprBSsr3n2U5u/0nmQ4bxg==", "token_balance": 14131 }, + { "peer_id": "EiBEi4r/vpOjt9Wp5obXZdHQoLTFoJNZutbtxdpEp6USPQ==", "token_balance": 14131 }, + { "peer_id": "EiBFJEoAMl9Y2wD1es0PIxJeo9zVVblq2yCrSAXEMs6ecw==", "token_balance": 14131 }, + { "peer_id": "EiBFNztvBF1fiJdeRZvqvwPykbvDJejXrak1Ex5+U+lu3g==", "token_balance": 14131 }, + { "peer_id": "EiBg0bACYD3Hqn6pi6K8lX7Q0z1d9BY7+qQuCgVao84yzg==", "token_balance": 14131 }, + { "peer_id": "EiBIQXASyYPnVMmSrQcchsWtsf/woOYQXfF6pYex/3IxkA==", "token_balance": 14131 }, + { "peer_id": "EiBiX3Di7eSOIV15fJn9MQa/2rk4t33vV5oP52Bz8qjzyA==", "token_balance": 14131 }, + { "peer_id": "EiBJQSjehIVAEQPADz1EPnq+/YtndF/gzfHhnAo6/Ytn3w==", "token_balance": 14131 }, + { "peer_id": "EiBkd2IgkaS1fd/3sHQc2XeM93gueoQPymSJnpy62PdhDQ==", "token_balance": 14131 }, + { "peer_id": "EiBlbDATmftg5CHxy9uQ95hOmBa+dQMEfrCT6ax6V55H3g==", "token_balance": 14131 }, + { "peer_id": "EiBNwIGR5eqXMXoFkg/Co0tiHXg/4G8i721v6yqFPynNLg==", "token_balance": 14131 }, + { "peer_id": "EiBo1ySjJJFfzv+E0OJTfG3Bc/vr03X7YU7bb6kIFsRTRA==", "token_balance": 14131 }, + { "peer_id": "EiBP+L50XDFFnSpAqLEpmOd78BNyXkBm3+1hMWRYWv5Tzg==", "token_balance": 14131 }, + { "peer_id": "EiBPSC7f1vRRRHH5oy1xCTKjSglLpRDDJxbFhBswHXKtNw==", "token_balance": 14131 }, + { "peer_id": "EiBR8FV0T0jpAuxBZCI6B+91USsfOb0l/3HVB0i+gnN3Wg==", "token_balance": 14131 }, + { "peer_id": 
"EiBRRGlEPmJifZ0MMoy1Utw8Q7o2UA8BKDRIYA6bx097GQ==", "token_balance": 14131 }, + { "peer_id": "EiBSMFZZ4TJffoXAwdUBGSnR+Um/G9ZCrrAiS8du2+ChKQ==", "token_balance": 14131 }, + { "peer_id": "EiBSRkj61j5iARkjIsvoRDtqbuiFXJAZei4TdQvLF7oEHA==", "token_balance": 14131 }, + { "peer_id": "EiBULRje4iGOFbRskmsV3hDFb0JxHIiAwqacthUkd+qJgA==", "token_balance": 14131 }, + { "peer_id": "EiBVLbCAfL1PI281LA2x+3zRtljiwMZomOA+uMuGRRvHnw==", "token_balance": 14131 }, + { "peer_id": "EiBwCm57KOKTW3KojF5hcYr5uny0gLOufZCQ9z29OXevgA==", "token_balance": 14131 }, + { "peer_id": "EiBWkUrOeYF9fBjgYjM1pL/d0F2ecDEUKJ3MwtK7nz2ihw==", "token_balance": 14131 }, + { "peer_id": "EiBX2BFQK4mDRNTJkQZi4t0AsAj0+t6YvVgSA62Cr1lLgA==", "token_balance": 14131 }, + { "peer_id": "EiBXXiIu7TEJZ8iubowxgHX2SXps2hvcoz7UOvU86yEZtw==", "token_balance": 14131 }, + { "peer_id": "EiBz/9XZy8GzqQAMSsh33WH7gSl9BjCe2CCosxuqL3kEyA==", "token_balance": 14131 }, + { "peer_id": "EiBZDzjcmWq411H6xhuYjPm2UinwNQPnjtqAcvacV6KTxg==", "token_balance": 14131 }, + { "peer_id": "EiBZrZpZEbVlmhzUnPUY/TmmgTXDFukYbxUe1fDFBobN2A==", "token_balance": 14131 }, + { "peer_id": "EiC/WW9wiPF6oV4P61KF+x63PP7bxUYKKn7KdJtduxpSOQ==", "token_balance": 14131 }, + { "peer_id": "EiC5PG8ELRDfFs57o/Lzo3wbl/0sUv+jfFysqgT+f4zF9g==", "token_balance": 14131 }, + { "peer_id": "EiC61EfGsE1TxPfXQIu8sIo8agpOIag5OWzuxy78So8p5g==", "token_balance": 14131 }, + { "peer_id": "EiC6Eqpi4B1w02f/sMTzCK6yza1FfrLcfajM1roAXwBW4g==", "token_balance": 14131 }, + { "peer_id": "EiC8PpB0i1w/qlTfAjSkF+71BpioCFJKx83PeRbj2dMGkA==", "token_balance": 14131 }, + { "peer_id": "EiCAKalEIOVZll5qTaJkoOORPvlvWy4dliRkvU4DGkTBPw==", "token_balance": 14131 }, + { "peer_id": "EiCAm8vtd0n4znyA2/wvOm0z6b1Np+Da0CILLlpKNDTVLQ==", "token_balance": 14131 }, + { "peer_id": "EiCbENzmjAe1NBaulfMVwu8WeIw4EsTZ1y9Wb9jM8aACvw==", "token_balance": 14131 }, + { "peer_id": "EiCBoUdPb0/A7SwR3PGVoS1B1NE+A3JschDYrrznjP/2/w==", "token_balance": 14131 }, + { "peer_id": 
"EiCEIc+lI/0h1nwaHM+jwyIwdNv9DuTwzsi9vXSYkvFGzQ==", "token_balance": 14131 }, + { "peer_id": "EiCEJTohv96gw6sui4NezA/5zS3DShpHcS3yrT0ifXBITg==", "token_balance": 14131 }, + { "peer_id": "EiCFAQ8TKGZxMN4SNekcYJRHGBIOXKbkVFRr2ScCd7msOg==", "token_balance": 14131 }, + { "peer_id": "EiCfOozM8WwgvzAOuhH4QTSqtzVjYYFolT9J3zBC4Vuo+A==", "token_balance": 14131 }, + { "peer_id": "EiCFTKONzz79nk8pq4Cx8w4Rj9lRLUQSUuqGJPKyR5eLQQ==", "token_balance": 14131 }, + { "peer_id": "EiCgA2vgV5TkNbB1BCDvfvCHwoN77QnBdLYVKbB+iucROQ==", "token_balance": 14131 }, + { "peer_id": "EiChBy5gzOvUBJGjlGwu8D0AOdyRTnZxkWf6xj3wtdfoLg==", "token_balance": 14131 }, + { "peer_id": "EiCHt1daeYFg3cqqGco15WbsDvK3/EMX7iSW0l3cG934Ig==", "token_balance": 14131 }, + { "peer_id": "EiChTVF5cYU31FtNQN4pi5TbcYZbl9Mm4vvgRX3WXkBRpg==", "token_balance": 14131 }, + { "peer_id": "EiChuk1RuNAGMf/jsHBaevQqu1Lc6CQgEnIUVc6+a9MfZw==", "token_balance": 14131 }, + { "peer_id": "EiCJIIdUQWygzHxU4N3eipgAYfA6oWME2+zeprOjHDuG+g==", "token_balance": 14131 }, + { "peer_id": "EiCKA61pUxmFcMaKmn/ykOD8UUPeun96ATttwdb96eP7iQ==", "token_balance": 14131 }, + { "peer_id": "EiCKFxs1XPwcYROzEWcBx3cm+6sAUXD4ytcHrvKfO1ecEA==", "token_balance": 14131 }, + { "peer_id": "EiCMU3UKFEm09ry84SM+cJWd6tQIVKp4a2FsDYv4KueM4w==", "token_balance": 14131 }, + { "peer_id": "EiCMUwONUM+HZjbSJogaFwryZUqEhVZfOOtsxfc15595dw==", "token_balance": 14131 }, + { "peer_id": "EiCP8Tvwyr0X13ErQ08wwJfQDQlIHJf36xcp36daZax84A==", "token_balance": 14131 }, + { "peer_id": "EiCPEA5+vd9bWr35axyuo+aGl8sPNzuWmuYqiDbLsKmq8A==", "token_balance": 14131 }, + { "peer_id": "EiCpXjiJFcyuQDSo9bHvZaN0lysbBqSyQdPL0ncRWjgqMg==", "token_balance": 14131 }, + { "peer_id": "EiCqm2uoj682Zm9wLC34QS6PZzT8MK+1VouN2uS7ZynLyA==", "token_balance": 14131 }, + { "peer_id": "EiCSiwM6vnVlcBve5sx5A31GJVpCW3LkTtgyLtcVZHSckg==", "token_balance": 14131 }, + { "peer_id": "EiCttygf7ST4sHqshAHusQazOBFVIAOPxQGv/dUwKmYDyQ==", "token_balance": 14131 }, + { "peer_id": 
"EiCU3/SNW2GUnF9j2m/AqEKvSlbj89RCIUe61hb+aaI4SA==", "token_balance": 14131 }, + { "peer_id": "EiCWe4C+eEuLmkui1FCgMHzw8dVCXG3zNRLm3COkdKxE0Q==", "token_balance": 14131 }, + { "peer_id": "EiCyDcq6dy4ZHHMo+S/zsxZbVF1J8qv/hkHjQBXtChj/pg==", "token_balance": 14131 }, + { "peer_id": "EiD/sK801+gJVhriWwPInV8K7yB0OVCqWQaShpZy+JMy4g==", "token_balance": 14131 }, + { "peer_id": "EiD2WIIwy5DcHuhy8jAG7xrvB00aB4dNJEYuv/6Fs4X7Fg==", "token_balance": 14131 }, + { "peer_id": "EiD38tGZ8118CmVi324OI6HjiOdiZfoKpQ0DM6QroSLGmA==", "token_balance": 14131 }, + { "peer_id": "EiD5C+It9Fi5ae81Bud8egyByPyYHJNMDBGLMKR8h8pCrA==", "token_balance": 14131 }, + { "peer_id": "EiD8qqXLTPxTS0tChKYkxtrYIiOs4YTxA0EcM3KfFYlCBg==", "token_balance": 14131 }, + { "peer_id": "EiDB7gTrbys+YCIGXqohCrJgSIKKhLkuIQ4Lt3XgcwUiYQ==", "token_balance": 14131 }, + { "peer_id": "EiDbn7CabET039d7TULd5bRWSW5a4FnuPyO7l3a7Qu6ULQ==", "token_balance": 14131 }, + { "peer_id": "EiDc3vxkQQxMDZh5/shT9asIXxRKADOmsIT/yUKm+lErsA==", "token_balance": 14131 }, + { "peer_id": "EiDDNED6DphDR/MfWD/LcmT1cF+myrhu8DqRDjpVG9kN1Q==", "token_balance": 14131 }, + { "peer_id": "EiDFZ3WZDDPjYO4+inRDTuyAQN0bNCCkDX7YcxVaGL5isQ==", "token_balance": 14131 }, + { "peer_id": "EiDghtmPnBgBl9aEfAEoX7zqGjRqyikcf8imBRSIx79SpQ==", "token_balance": 14131 }, + { "peer_id": "EiDI3IWtdE6+riV4Ag3pe32Ek0g7QETFsQD6IuCx7ZzkNQ==", "token_balance": 14131 }, + { "peer_id": "EiDLU5mqRefq3Lnleo4LXgk5+JG2Tq1LQ1daJErAkSmQkQ==", "token_balance": 14131 }, + { "peer_id": "EiDmlBVIHR+jqAZab1JomprMk4Zd60Kk9pfmvhxPRGwIzQ==", "token_balance": 14131 }, + { "peer_id": "EiDMLuuSCxqmmAZwl0fhAEB29T2NrYuxNX3eoVkoB+LgFw==", "token_balance": 14131 }, + { "peer_id": "EiDIXso0zaNVYv9o1oa6Xx4DbheVEht+mMcPJIvJWKSxuQ==", "token_balance": 14131 }, + { "peer_id": "EiDOt1mXOV0cA1q1yWHO7At79CFOSxBAFidPQhwMD4licg==", "token_balance": 14131 }, + { "peer_id": "EiDP21JvwRuBXadS9TWZ+AHvvsAOvucLV2GUf5CehuninA==", "token_balance": 14131 }, + { "peer_id": 
"EiDQYHxH3uptGi8btm3JlSllnuJv4psDL0mrgVmE4Tg0nA==", "token_balance": 14131 }, + { "peer_id": "EiDSJUeAmaspN5qvLSnOR/ku1cUtqA75i1nqgBND+X8VkA==", "token_balance": 14131 }, + { "peer_id": "EiDsx6A1ZJobkwHzI1L4x8MV/qSTnLo7B4dVwCOcmFntQw==", "token_balance": 14131 }, + { "peer_id": "EiDTUujybIA/oPuG8hSjPUioegcd1pOillvksY5zeOTUqw==", "token_balance": 14131 }, + { "peer_id": "EiDucIlcuEY1ohpR421WkGUOMimWFqAfMG3tp7lUtx534w==", "token_balance": 14131 }, + { "peer_id": "EiDueSVvXpj2B5IKYkjmjnGZtEeXuyqvCxgsNyClTeSZqA==", "token_balance": 14131 }, + { "peer_id": "EiDuXb8WGtb41p0fTfCECraBXI71y8TLCqgjyQP8/Cfw8A==", "token_balance": 14131 }, + { "peer_id": "EiDV19Jt3o+SI2JnbAub+zMddORpSZ+dfGO/j+P1lGDhPg==", "token_balance": 14131 }, + { "peer_id": "EiDWzbv+2wYUL9q5uDhQZq+j/OPxjzLZxCrwZCYQtPMAZQ==", "token_balance": 14131 }, + { "peer_id": "EiDx6BMj1AEzyv71+ZcJz8lpYZJo7TpFjj4aJ2jf8pXq3g==", "token_balance": 14131 }, + { "peer_id": "EiDXMvSSOv+nF6qZuBEU+cVCczPoclZgsWEzaDek8dIGjA==", "token_balance": 14131 }, + { "peer_id": "EiDyqp7F9IKBL5BGiAYdExLw+8ItsAYL2AVcS3XMDtPp+Q==", "token_balance": 14131 }, + { "peer_id": "EiDZGMJApfat9XujI1X19ulYZV9KdCjUP94kTp4ypv1JCw==", "token_balance": 14131 } + ] +} \ No newline at end of file diff --git a/node/store/clock.go b/node/store/clock.go index b688a58..073095b 100644 --- a/node/store/clock.go +++ b/node/store/clock.go @@ -97,29 +97,29 @@ type ClockStore interface { } type PebbleClockStore struct { - db *pebble.DB + db KVDB logger *zap.Logger } var _ ClockStore = (*PebbleClockStore)(nil) type PebbleMasterClockIterator struct { - i *pebble.Iterator + i Iterator } type PebbleClockIterator struct { - i *pebble.Iterator + i Iterator db *PebbleClockStore } type PebbleCandidateClockIterator struct { - i *pebble.Iterator + i Iterator db *PebbleClockStore } -var _ Iterator[*protobufs.ClockFrame] = (*PebbleMasterClockIterator)(nil) -var _ Iterator[*protobufs.ClockFrame] = (*PebbleClockIterator)(nil) -var _ Iterator[*protobufs.ClockFrame] = 
(*PebbleCandidateClockIterator)(nil) +var _ TypedIterator[*protobufs.ClockFrame] = (*PebbleMasterClockIterator)(nil) +var _ TypedIterator[*protobufs.ClockFrame] = (*PebbleClockIterator)(nil) +var _ TypedIterator[*protobufs.ClockFrame] = (*PebbleCandidateClockIterator)(nil) func (p *PebbleMasterClockIterator) First() bool { return p.i.First() @@ -173,7 +173,7 @@ func (p *PebbleMasterClockIterator) Value() (*protobufs.ClockFrame, error) { return nil, errors.Wrap(err, "get master clock frame iterator value") } - frame.ParentSelector = parent.Bytes() + frame.ParentSelector = parent.FillBytes(make([]byte, 32)) return frame, nil } @@ -306,7 +306,7 @@ func (p *PebbleCandidateClockIterator) Close() error { return errors.Wrap(p.i.Close(), "closing candidate clock frame iterator") } -func NewPebbleClockStore(db *pebble.DB, logger *zap.Logger) *PebbleClockStore { +func NewPebbleClockStore(db KVDB, logger *zap.Logger) *PebbleClockStore { return &PebbleClockStore{ db, logger, @@ -446,9 +446,7 @@ func clockProverTrieKey(filter []byte, frameNumber uint64) []byte { } func (p *PebbleClockStore) NewTransaction() (Transaction, error) { - return &PebbleTransaction{ - b: p.db.NewBatch(), - }, nil + return p.db.NewBatch(), nil } // GetEarliestMasterClockFrame implements ClockStore. 
@@ -530,7 +528,7 @@ func (p *PebbleClockStore) GetMasterClockFrame( return nil, errors.Wrap(err, "get master clock frame") } - frame.ParentSelector = parent.Bytes() + frame.ParentSelector = parent.FillBytes(make([]byte, 32)) return frame, nil } @@ -547,10 +545,10 @@ func (p *PebbleClockStore) RangeMasterClockFrames( startFrameNumber = temp } - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: clockMasterFrameKey(filter, startFrameNumber), - UpperBound: clockMasterFrameKey(filter, endFrameNumber), - }) + iter, err := p.db.NewIter( + clockMasterFrameKey(filter, startFrameNumber), + clockMasterFrameKey(filter, endFrameNumber), + ) if err != nil { return nil, errors.Wrap(err, "range master clock frames") } @@ -863,7 +861,7 @@ func (p *PebbleClockStore) PutCandidateDataClockFrame( frame *protobufs.ClockFrame, txn Transaction, ) error { - if err := p.saveAggregateProofs(nil, frame); err != nil { + if err := p.saveAggregateProofs(txn, frame); err != nil { return errors.Wrap( errors.Wrap(err, ErrInvalidData.Error()), "put candidate data clock frame", @@ -920,7 +918,7 @@ func (p *PebbleClockStore) PutDataClockFrame( backfill bool, ) error { if frame.FrameNumber != 0 { - if err := p.saveAggregateProofs(nil, frame); err != nil { + if err := p.saveAggregateProofs(txn, frame); err != nil { return errors.Wrap( errors.Wrap(err, ErrInvalidData.Error()), "put candidate data clock frame", @@ -1004,8 +1002,8 @@ func (p *PebbleClockStore) GetCandidateDataClockFrames( filter []byte, frameNumber uint64, ) ([]*protobufs.ClockFrame, error) { - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: clockDataCandidateFrameKey( + iter, err := p.db.NewIter( + clockDataCandidateFrameKey( filter, frameNumber, []byte{ @@ -1021,7 +1019,7 @@ func (p *PebbleClockStore) GetCandidateDataClockFrames( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }, ), - UpperBound: clockDataCandidateFrameKey( + clockDataCandidateFrameKey( filter, frameNumber, []byte{ @@ -1037,7 +1035,7 @@ func (p 
*PebbleClockStore) GetCandidateDataClockFrames( 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }, ), - }) + ) if err != nil { return nil, errors.Wrap(err, "get candidate data clock frames") } @@ -1084,8 +1082,8 @@ func (p *PebbleClockStore) RangeCandidateDataClockFrames( 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, } } - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: clockDataCandidateFrameKey( + iter, err := p.db.NewIter( + clockDataCandidateFrameKey( filter, frameNumber, fromParent, @@ -1096,7 +1094,7 @@ func (p *PebbleClockStore) RangeCandidateDataClockFrames( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }, ), - UpperBound: clockDataCandidateFrameKey( + clockDataCandidateFrameKey( filter, frameNumber, toParent, @@ -1107,7 +1105,7 @@ func (p *PebbleClockStore) RangeCandidateDataClockFrames( 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }, ), - }) + ) if err != nil { return nil, errors.Wrap(err, "range candidate data clock frames") } @@ -1127,10 +1125,10 @@ func (p *PebbleClockStore) RangeDataClockFrames( startFrameNumber = temp } - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: clockDataFrameKey(filter, startFrameNumber), - UpperBound: clockDataFrameKey(filter, endFrameNumber), - }) + iter, err := p.db.NewIter( + clockDataFrameKey(filter, startFrameNumber), + clockDataFrameKey(filter, endFrameNumber), + ) if err != nil { return nil, errors.Wrap(err, "get data clock frames") } @@ -1161,10 +1159,7 @@ func (p *PebbleClockStore) Deduplicate(filter []byte) error { }, ) - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: from, - UpperBound: to, - }) + iter, err := p.db.NewIter(from, to) if err != nil { return errors.Wrap(err, "deduplicate") } @@ -1187,7 +1182,7 @@ func (p *PebbleClockStore) Deduplicate(filter []byte) error { return err } - err = p.db.Set(iter.Key(), newValue, &pebble.WriteOptions{Sync: true}) + err = p.db.Set(iter.Key(), newValue) if err != nil { return err } @@ -1205,10 +1200,7 @@ func (p 
*PebbleClockStore) Deduplicate(filter []byte) error { from = clockDataFrameKey(filter, 1) to = clockDataFrameKey(filter, 20000) - iter, err = p.db.NewIter(&pebble.IterOptions{ - LowerBound: from, - UpperBound: to, - }) + iter, err = p.db.NewIter(from, to) if err != nil { return errors.Wrap(err, "deduplicate") } @@ -1231,7 +1223,7 @@ func (p *PebbleClockStore) Deduplicate(filter []byte) error { return err } - err = p.db.Set(iter.Key(), newValue, &pebble.WriteOptions{Sync: true}) + err = p.db.Set(iter.Key(), newValue) if err != nil { return err } @@ -1279,10 +1271,7 @@ func (p *PebbleClockStore) Deduplicate(filter []byte) error { }, ) - iter, err = p.db.NewIter(&pebble.IterOptions{ - LowerBound: from, - UpperBound: to, - }) + iter, err = p.db.NewIter(from, to) if err != nil { return errors.Wrap(err, "deduplicate") } @@ -1305,7 +1294,7 @@ func (p *PebbleClockStore) Deduplicate(filter []byte) error { return err } - err = p.db.Set(iter.Key(), newValue, &pebble.WriteOptions{Sync: true}) + err = p.db.Set(iter.Key(), newValue) if err != nil { return err } @@ -1334,10 +1323,7 @@ func (p *PebbleClockStore) GetCompressedDataClockFrames( from := clockDataFrameKey(filter, fromFrameNumber) to := clockDataFrameKey(filter, toFrameNumber+1) - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: from, - UpperBound: to, - }) + iter, err := p.db.NewIter(from, to) if err != nil { return nil, errors.Wrap(err, "get compressed data clock frames") } @@ -1418,10 +1404,7 @@ func (p *PebbleClockStore) GetCompressedDataClockFrames( }, ) - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: from, - UpperBound: to, - }) + iter, err := p.db.NewIter(from, to) if err != nil { return nil, errors.Wrap(err, "get compressed data clock frames") } @@ -1458,7 +1441,7 @@ func (p *PebbleClockStore) GetCompressedDataClockFrames( if err != nil { return nil, errors.Wrap(err, "get compressed data clock frames") } - parentSelector, _, _, err := frame.GetParentSelectorAndDistance() + 
parentSelector, _, _, err := frame.GetParentSelectorAndDistance(nil) if err != nil { return nil, errors.Wrap(err, "get compressed data clock frames") } @@ -1480,8 +1463,28 @@ func (p *PebbleClockStore) GetCompressedDataClockFrames( break } score := new(big.Int) - for _, p := range paths[i] { - _, distance, _, err := p.GetParentSelectorAndDistance() + for _, path := range paths[i] { + master, err := p.GetMasterClockFrame( + []byte{ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, + path.FrameNumber, + ) + if err != nil { + return nil, errors.Wrap(err, "get compressed data clock frames") + } + + discriminator, err := master.GetSelector() + if err != nil { + return nil, errors.Wrap(err, "get compressed data clock frames") + } + + _, distance, _, err := path.GetParentSelectorAndDistance( + discriminator, + ) if err != nil { return nil, errors.Wrap(err, "get compressed data clock frames") } @@ -1535,10 +1538,13 @@ func (p *PebbleClockStore) GetCompressedDataClockFrames( return nil, errors.Wrap(err, "get compressed data clock frames") } - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: dataProofInclusionKey(filter, []byte(k), 0), - UpperBound: dataProofInclusionKey(filter, []byte(k), limit+1), - }) + iter, err := p.db.NewIter( + dataProofInclusionKey(filter, []byte(k), 0), + dataProofInclusionKey(filter, []byte(k), limit+1), + ) + if err != nil { + return nil, errors.Wrap(err, "get compressed data clock frames") + } for iter.First(); iter.Valid(); iter.Next() { incCommit := iter.Value() @@ -1632,9 +1638,6 @@ func (p *PebbleClockStore) SetLatestDataClockFrameNumber( err := p.db.Set( clockDataLatestIndex(filter), binary.BigEndian.AppendUint64(nil, frameNumber), - &pebble.WriteOptions{ - Sync: true, - }, ) return errors.Wrap(err, "set latest data clock frame number") @@ -1678,9 +1681,6 @@ func (p 
*PebbleClockStore) DeleteCandidateDataClockFrameRange( 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }, ), - &pebble.WriteOptions{ - Sync: true, - }, ) return errors.Wrap(err, "delete candidate data clock frame range") } @@ -1727,10 +1727,13 @@ func (p *PebbleClockStore) GetHighestCandidateDataClockFrame( }, ) - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: from, - UpperBound: to, - }) + iter, err := p.db.NewIter(from, to) + if err != nil { + return nil, errors.Wrap( + errors.Wrap(err, ErrInvalidData.Error()), + "get highest candidate data clock frame", + ) + } found := iter.SeekLT(to) if found { diff --git a/node/store/data_proof.go b/node/store/data_proof.go index 6221447..0952fc2 100644 --- a/node/store/data_proof.go +++ b/node/store/data_proof.go @@ -30,12 +30,12 @@ type DataProofStore interface { } type PebbleDataProofStore struct { - db *pebble.DB + db KVDB logger *zap.Logger } func NewPebbleDataProofStore( - db *pebble.DB, + db KVDB, logger *zap.Logger, ) *PebbleDataProofStore { return &PebbleDataProofStore{ @@ -81,13 +81,11 @@ func dataProofSegmentKey( } func (p *PebbleDataProofStore) NewTransaction() (Transaction, error) { - return &PebbleTransaction{ - b: p.db.NewBatch(), - }, nil + return p.db.NewBatch(), nil } func internalGetAggregateProof( - db *pebble.DB, + db KVDB, filter []byte, commitment []byte, frameNumber uint64, @@ -114,10 +112,10 @@ func internalGetAggregateProof( Proof: copied, } - iter, err := db.NewIter(&pebble.IterOptions{ - LowerBound: dataProofInclusionKey(filter, commitment, 0), - UpperBound: dataProofInclusionKey(filter, commitment, limit+1), - }) + iter, err := db.NewIter( + dataProofInclusionKey(filter, commitment, 0), + dataProofInclusionKey(filter, commitment, limit+1), + ) if err != nil { return nil, errors.Wrap(err, "get aggregate proof") } @@ -206,7 +204,7 @@ func (p *PebbleDataProofStore) GetAggregateProof( } func internalPutAggregateProof( - db *pebble.DB, + db KVDB, txn Transaction, aggregateProof 
*protobufs.InclusionAggregateProof, commitment []byte, diff --git a/node/store/inmem.go b/node/store/inmem.go new file mode 100644 index 0000000..7ec5661 --- /dev/null +++ b/node/store/inmem.go @@ -0,0 +1,349 @@ +package store + +import ( + "errors" + "io" + "math/rand" + "sort" + "sync" + + "github.com/cockroachdb/pebble" +) + +type InMemKVDB struct { + open bool + sortedKeys []string + store map[string][]byte + storeMx sync.Mutex +} + +type Operation int + +const ( + SetOperation Operation = iota + DeleteOperation +) + +type InMemKVDBOperation struct { + op Operation + key []byte + value []byte +} + +type InMemKVDBTransaction struct { + id int + changes []InMemKVDBOperation + db *InMemKVDB +} + +type InMemKVDBIterator struct { + db *InMemKVDB + start []byte + end []byte + pos int + open bool +} + +func (i *InMemKVDBIterator) Key() []byte { + if !i.open { + return nil + } + i.db.storeMx.Lock() + if _, ok := i.db.store[i.db.sortedKeys[i.pos]]; !ok { + return nil + } + i.db.storeMx.Unlock() + + return []byte(i.db.sortedKeys[i.pos]) +} + +func (i *InMemKVDBIterator) First() bool { + if !i.open { + return false + } + i.db.storeMx.Lock() + found := false + idx := sort.SearchStrings(i.db.sortedKeys, string(i.start)) + final := sort.SearchStrings(i.db.sortedKeys, string(i.end)) + if idx < final { + i.pos = idx + found = true + } + i.db.storeMx.Unlock() + + return found +} + +func (i *InMemKVDBIterator) Next() bool { + if !i.open { + return false + } + i.db.storeMx.Lock() + found := false + if _, ok := i.db.store[i.db.sortedKeys[i.pos]]; ok { + final := sort.SearchStrings(i.db.sortedKeys, string(i.end)) + if i.pos < final { + i.pos = i.pos + 1 + found = true + } + } + i.db.storeMx.Unlock() + + return found +} + +func (i *InMemKVDBIterator) Prev() bool { + if !i.open { + return false + } + i.db.storeMx.Lock() + found := false + if _, ok := i.db.store[i.db.sortedKeys[i.pos]]; ok { + start := sort.SearchStrings(i.db.sortedKeys, string(i.start)) + if i.pos-1 > start { + i.pos 
= i.pos - 1 + found = true + } + } + i.db.storeMx.Unlock() + + return found +} + +func (i *InMemKVDBIterator) Valid() bool { + if !i.open { + return false + } + i.db.storeMx.Lock() + start := sort.SearchStrings(i.db.sortedKeys, string(i.start)) + final := sort.SearchStrings(i.db.sortedKeys, string(i.end)) + i.db.storeMx.Unlock() + + return i.pos < final && i.pos >= start +} + +func (i *InMemKVDBIterator) Value() []byte { + if !i.open { + return nil + } + + i.db.storeMx.Lock() + value := i.db.store[i.db.sortedKeys[i.pos]] + i.db.storeMx.Unlock() + + return value +} + +func (i *InMemKVDBIterator) Close() error { + if !i.open { + return errors.New("already closed iterator") + } + + i.open = false + return nil +} + +func (i *InMemKVDBIterator) SeekLT(lt []byte) bool { + if !i.open { + return false + } + i.db.storeMx.Lock() + found := false + if _, ok := i.db.store[i.db.sortedKeys[i.pos]]; ok { + idx := sort.SearchStrings(i.db.sortedKeys, string(lt)) + start := sort.SearchStrings(i.db.sortedKeys, string(i.start)) + if idx >= start { + i.pos = idx + 1 + found = true + } + } + i.db.storeMx.Unlock() + + return found +} + +func (t *InMemKVDBTransaction) Set(key []byte, value []byte) error { + if !t.db.open { + return errors.New("inmem db closed") + } + + t.changes = append(t.changes, InMemKVDBOperation{ + op: SetOperation, + key: key, + value: value, + }) + + return nil +} + +func (t *InMemKVDBTransaction) Commit() error { + if !t.db.open { + return errors.New("inmem db closed") + } + + var err error +loop: + for _, op := range t.changes { + switch op.op { + case SetOperation: + err = t.db.Set(op.key, op.value) + if err != nil { + break loop + } + case DeleteOperation: + err = t.db.Delete(op.key) + if err != nil { + break loop + } + } + } + + return err +} + +func (t *InMemKVDBTransaction) Delete(key []byte) error { + if !t.db.open { + return errors.New("inmem db closed") + } + + t.changes = append(t.changes, InMemKVDBOperation{ + op: DeleteOperation, + key: key, + }) + + 
return nil +} + +func (t *InMemKVDBTransaction) Abort() error { + return nil +} + +func NewInMemKVDB() *InMemKVDB { + return &InMemKVDB{ + open: true, + store: map[string][]byte{}, + sortedKeys: []string{}, + } +} + +func (d *InMemKVDB) Get(key []byte) ([]byte, io.Closer, error) { + if !d.open { + return nil, nil, errors.New("inmem db closed") + } + + d.storeMx.Lock() + b, ok := d.store[string(key)] + d.storeMx.Unlock() + if !ok { + return nil, nil, pebble.ErrNotFound + } + return b, io.NopCloser(nil), nil +} + +func (d *InMemKVDB) Set(key, value []byte) error { + if !d.open { + return errors.New("inmem db closed") + } + + d.storeMx.Lock() + _, ok := d.store[string(key)] + if !ok { + i := sort.SearchStrings(d.sortedKeys, string(key)) + if len(d.sortedKeys) > i { + d.sortedKeys = append(d.sortedKeys[:i+1], d.sortedKeys[i:]...) + d.sortedKeys[i] = string(key) + } else { + d.sortedKeys = append(d.sortedKeys, string(key)) + } + } + d.store[string(key)] = value + + d.storeMx.Unlock() + return nil +} + +func (d *InMemKVDB) Delete(key []byte) error { + if !d.open { + return errors.New("inmem db closed") + } + + d.storeMx.Lock() + _, ok := d.store[string(key)] + if ok { + i := sort.SearchStrings(d.sortedKeys, string(key)) + if len(d.sortedKeys)-1 > i { + d.sortedKeys = append(d.sortedKeys[:i], d.sortedKeys[i+1:]...) 
+ } else { + d.sortedKeys = d.sortedKeys[:i] + } + } + delete(d.store, string(key)) + d.storeMx.Unlock() + return nil +} + +func (d *InMemKVDB) NewBatch() Transaction { + if !d.open { + return nil + } + + id := rand.Int() + return &InMemKVDBTransaction{ + id: id, + db: d, + changes: []InMemKVDBOperation{}, + } +} + +func (d *InMemKVDB) NewIter(lowerBound []byte, upperBound []byte) (Iterator, error) { + if !d.open { + return nil, errors.New("inmem db closed") + } + + return &InMemKVDBIterator{ + open: true, + db: d, + start: lowerBound, + end: upperBound, + pos: -1, + }, nil +} + +func (d *InMemKVDB) Compact(start, end []byte, parallelize bool) error { + if !d.open { + return errors.New("inmem db closed") + } + + return nil +} + +func (d *InMemKVDB) Close() error { + if !d.open { + return errors.New("inmem db closed") + } + + d.open = false + return nil +} + +func (d *InMemKVDB) DeleteRange(start, end []byte) error { + if !d.open { + return errors.New("inmem db closed") + } + + iter, err := d.NewIter(start, end) + if err != nil { + return err + } + + for iter.First(); iter.Valid(); iter.Next() { + err = d.Delete(iter.Key()) + if err != nil { + return err + } + } + + return nil +} + +var _ KVDB = (*InMemKVDB)(nil) diff --git a/node/store/inmem_test.go b/node/store/inmem_test.go new file mode 100644 index 0000000..4c5424a --- /dev/null +++ b/node/store/inmem_test.go @@ -0,0 +1,90 @@ +package store_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "source.quilibrium.com/quilibrium/monorepo/node/store" +) + +func TestIter(t *testing.T) { + db := store.NewInMemKVDB() + db.Set([]byte{0x01}, []byte{0x01}) + db.Set([]byte{0x02}, []byte{0x02}) + db.Set([]byte{0x03}, []byte{0x03}) + db.Set([]byte{0x04}, []byte{0x04}) + db.Set([]byte{0x06}, []byte{0x06}) + db.Set([]byte{0x07}, []byte{0x07}) + db.Set([]byte{0x08}, []byte{0x08}) + db.Set([]byte{0x010}, []byte{0x010}) + db.Set([]byte{0x012}, []byte{0x012}) + db.Set([]byte{0x014}, []byte{0x014}) + iter, err 
:= db.NewIter([]byte{0x01}, []byte{0x04}) + assert.NoError(t, err) + assert.True(t, iter.First()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x01}) + assert.ElementsMatch(t, iter.Key(), []byte{0x01}) + assert.True(t, iter.Next()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x02}) + assert.ElementsMatch(t, iter.Key(), []byte{0x02}) + assert.True(t, iter.Next()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x03}) + assert.ElementsMatch(t, iter.Key(), []byte{0x03}) + assert.True(t, iter.Next()) + assert.False(t, iter.Valid()) + assert.NoError(t, iter.Close()) + + iter, err = db.NewIter([]byte{0x06}, []byte{0x09}) + assert.NoError(t, err) + assert.True(t, iter.First()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x06}) + assert.ElementsMatch(t, iter.Key(), []byte{0x06}) + assert.True(t, iter.Next()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x07}) + assert.ElementsMatch(t, iter.Key(), []byte{0x07}) + assert.True(t, iter.Next()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x08}) + assert.ElementsMatch(t, iter.Key(), []byte{0x08}) + assert.True(t, iter.Next()) + assert.False(t, iter.Valid()) + + iter, err = db.NewIter([]byte{0x05}, []byte{0x09}) + assert.NoError(t, err) + assert.True(t, iter.First()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x06}) + assert.ElementsMatch(t, iter.Key(), []byte{0x06}) + assert.True(t, iter.Next()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x07}) + assert.ElementsMatch(t, iter.Key(), []byte{0x07}) + assert.True(t, iter.Next()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x08}) + assert.ElementsMatch(t, iter.Key(), []byte{0x08}) + assert.True(t, iter.Next()) + assert.False(t, iter.Valid()) + + iter, err = db.NewIter([]byte{0x010}, 
[]byte{0x015}) + assert.NoError(t, err) + assert.True(t, iter.First()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x10}) + assert.ElementsMatch(t, iter.Key(), []byte{0x10}) + assert.True(t, iter.Next()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x12}) + assert.ElementsMatch(t, iter.Key(), []byte{0x12}) + assert.True(t, iter.Next()) + assert.True(t, iter.Valid()) + assert.ElementsMatch(t, iter.Value(), []byte{0x14}) + assert.ElementsMatch(t, iter.Key(), []byte{0x14}) + assert.True(t, iter.Next()) + assert.False(t, iter.Valid()) +} diff --git a/node/store/iterator.go b/node/store/iterator.go index d239ee1..b19247d 100644 --- a/node/store/iterator.go +++ b/node/store/iterator.go @@ -2,7 +2,18 @@ package store import "google.golang.org/protobuf/proto" -type Iterator[T proto.Message] interface { +type Iterator interface { + Key() []byte + First() bool + Next() bool + Prev() bool + Valid() bool + Value() []byte + Close() error + SeekLT([]byte) bool +} + +type TypedIterator[T proto.Message] interface { First() bool Next() bool Valid() bool diff --git a/node/store/key.go b/node/store/key.go index 0adc8fc..0031e3c 100644 --- a/node/store/key.go +++ b/node/store/key.go @@ -37,28 +37,28 @@ type KeyStore interface { } type PebbleKeyStore struct { - db *pebble.DB + db KVDB logger *zap.Logger } type PebbleProvingKeyIterator struct { - i *pebble.Iterator + i Iterator } type PebbleStagedProvingKeyIterator struct { - i *pebble.Iterator + i Iterator } type PebbleKeyBundleIterator struct { - i *pebble.Iterator + i Iterator } var pki = (*PebbleProvingKeyIterator)(nil) var spki = (*PebbleStagedProvingKeyIterator)(nil) var kbi = (*PebbleKeyBundleIterator)(nil) -var _ Iterator[*protobufs.InclusionCommitment] = pki -var _ Iterator[*protobufs.ProvingKeyAnnouncement] = spki -var _ Iterator[*protobufs.InclusionCommitment] = kbi +var _ TypedIterator[*protobufs.InclusionCommitment] = pki +var _ 
TypedIterator[*protobufs.ProvingKeyAnnouncement] = spki +var _ TypedIterator[*protobufs.InclusionCommitment] = kbi var _ KeyStore = (*PebbleKeyStore)(nil) func (p *PebbleProvingKeyIterator) First() bool { @@ -169,7 +169,7 @@ func (p *PebbleKeyBundleIterator) Close() error { return errors.Wrap(p.i.Close(), "closing iterator") } -func NewPebbleKeyStore(db *pebble.DB, logger *zap.Logger) *PebbleKeyStore { +func NewPebbleKeyStore(db KVDB, logger *zap.Logger) *PebbleKeyStore { return &PebbleKeyStore{ db, logger, @@ -217,9 +217,7 @@ func keyBundleEarliestKey(provingKey []byte) []byte { } func (p *PebbleKeyStore) NewTransaction() (Transaction, error) { - return &PebbleTransaction{ - b: p.db.NewBatch(), - }, nil + return p.db.NewBatch(), nil } // Stages a proving key for later inclusion on proof of meaningful work. @@ -235,9 +233,6 @@ func (p *PebbleKeyStore) StageProvingKey( err = p.db.Set( stagedProvingKeyKey(provingKey.PublicKey()), data, - &pebble.WriteOptions{ - Sync: true, - }, ) if err != nil { return errors.Wrap(err, "stage proving key") @@ -462,8 +457,8 @@ func (p *PebbleKeyStore) PutKeyBundle( } func (p *PebbleKeyStore) RangeProvingKeys() (*PebbleProvingKeyIterator, error) { - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: provingKeyKey([]byte{ + iter, err := p.db.NewIter( + provingKeyKey([]byte{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -473,7 +468,7 @@ func (p *PebbleKeyStore) RangeProvingKeys() (*PebbleProvingKeyIterator, error) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }), - UpperBound: provingKeyKey([]byte{ + provingKeyKey([]byte{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, @@ -483,7 +478,7 @@ func (p *PebbleKeyStore) RangeProvingKeys() (*PebbleProvingKeyIterator, error) { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }), 
- }) + ) if err != nil { return nil, errors.Wrap(err, "range proving keys") } @@ -495,8 +490,8 @@ func (p *PebbleKeyStore) RangeStagedProvingKeys() ( *PebbleStagedProvingKeyIterator, error, ) { - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: stagedProvingKeyKey([]byte{ + iter, err := p.db.NewIter( + stagedProvingKeyKey([]byte{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -506,7 +501,7 @@ func (p *PebbleKeyStore) RangeStagedProvingKeys() ( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }), - UpperBound: stagedProvingKeyKey([]byte{ + stagedProvingKeyKey([]byte{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, @@ -516,7 +511,7 @@ func (p *PebbleKeyStore) RangeStagedProvingKeys() ( 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }), - }) + ) if err != nil { return nil, errors.Wrap(err, "range staged proving keys") } @@ -528,10 +523,10 @@ func (p *PebbleKeyStore) RangeKeyBundleKeys(provingKey []byte) ( *PebbleKeyBundleIterator, error, ) { - iter, err := p.db.NewIter(&pebble.IterOptions{ - LowerBound: keyBundleKey(provingKey, 0), - UpperBound: keyBundleKey(provingKey, 0xffffffffffffffff), - }) + iter, err := p.db.NewIter( + keyBundleKey(provingKey, 0), + keyBundleKey(provingKey, 0xffffffffffffffff), + ) if err != nil { return nil, errors.Wrap(err, "range key bundle keys") } diff --git a/node/store/kvdb.go b/node/store/kvdb.go new file mode 100644 index 0000000..01ebadf --- /dev/null +++ b/node/store/kvdb.go @@ -0,0 +1,16 @@ +package store + +import ( + "io" +) + +type KVDB interface { + Get(key []byte) ([]byte, io.Closer, error) + Set(key, value []byte) error + Delete(key []byte) error + NewBatch() Transaction + NewIter(lowerBound []byte, upperBound []byte) (Iterator, error) + Compact(start, end []byte, parallelize bool) error + Close() error + 
DeleteRange(start, end []byte) error +} diff --git a/node/store/pebble.go b/node/store/pebble.go index fc475b9..91b39fd 100644 --- a/node/store/pebble.go +++ b/node/store/pebble.go @@ -1,19 +1,67 @@ package store import ( + "io" + "github.com/cockroachdb/pebble" "source.quilibrium.com/quilibrium/monorepo/node/config" ) -func NewPebbleDB(config *config.DBConfig) *pebble.DB { +type PebbleDB struct { + db *pebble.DB +} + +func NewPebbleDB(config *config.DBConfig) *PebbleDB { db, err := pebble.Open(config.Path, &pebble.Options{}) if err != nil { panic(err) } - return db + return &PebbleDB{db} } +func (p *PebbleDB) Get(key []byte) ([]byte, io.Closer, error) { + return p.db.Get(key) +} + +func (p *PebbleDB) Set(key, value []byte) error { + return p.db.Set(key, value, &pebble.WriteOptions{Sync: true}) +} + +func (p *PebbleDB) Delete(key []byte) error { + return p.db.Delete(key, &pebble.WriteOptions{Sync: true}) +} + +func (p *PebbleDB) NewBatch() Transaction { + return &PebbleTransaction{ + b: p.db.NewBatch(), + } +} + +func (p *PebbleDB) NewIter(lowerBound []byte, upperBound []byte) ( + Iterator, + error, +) { + return p.db.NewIter(&pebble.IterOptions{ + LowerBound: lowerBound, + UpperBound: upperBound, + }) +} + +func (p *PebbleDB) Compact(start, end []byte, parallelize bool) error { + return p.db.Compact(start, end, parallelize) +} + +func (p *PebbleDB) Close() error { + return p.db.Close() +} + +func (p *PebbleDB) DeleteRange(start, end []byte) error { + return p.db.DeleteRange(start, end, &pebble.WriteOptions{Sync: true}) +} + +var _ KVDB = (*PebbleDB)(nil) + type Transaction interface { Set(key []byte, value []byte) error Commit() error diff --git a/pebble/.editorconfig b/pebble/.editorconfig new file mode 100644 index 0000000..0e4642a --- /dev/null +++ b/pebble/.editorconfig @@ -0,0 +1,10 @@ +# See http://editorconfig.org + +[*] +end_of_line = lf +insert_final_newline = true +charset = utf-8 + +# For non-go files, we indent with two spaces. 
In go files we indent +# with tabs but still set indent_size to control the github web viewer. +indent_size=2 diff --git a/pebble/.github/workflows/ci.yaml b/pebble/.github/workflows/ci.yaml new file mode 100644 index 0000000..a8fbb26 --- /dev/null +++ b/pebble/.github/workflows/ci.yaml @@ -0,0 +1,160 @@ +name: Test + +on: + push: + branches: + - master + - crl-release-* + pull_request: + branches: + - master + - crl-release-* + +jobs: + + linux: + name: go-linux + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - run: make test generate + + linux-32bit: + name: go-linux-32bit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - run: GOARCH=386 make test + + linux-crossversion: + name: go-linux-crossversion + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - run: make crossversion-meta + + linux-race: + name: go-linux-race + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - run: make testrace TAGS= + + linux-no-invariants: + name: go-linux-no-invariants + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - run: make test TAGS= + + linux-no-cgo: + name: go-linux-no-cgo + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - run: CGO_ENABLED=0 make test TAGS= + + darwin: + name: go-macos + runs-on: macos-12 + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - run: make test + + windows: + name: go-windows + runs-on: windows-latest + 
steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - run: go test -v ./... + + bsds: + name: go-bsds + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - name: FreeBSD build + env: + GOOS: freebsd + run: go build -v ./... + + - name: NetBSD build + env: + GOOS: netbsd + run: go build -v ./... + + - name: OpenBSD build + env: + GOOS: openbsd + run: go build -v ./... + + go-lint-checks: + name: go-lint-checks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: "1.21" + + - name: mod-tidy-check + run: make mod-tidy-check + + - name: format-check + run: make format-check diff --git a/pebble/.github/workflows/code-cover-gen.yaml b/pebble/.github/workflows/code-cover-gen.yaml new file mode 100644 index 0000000..e6ec42a --- /dev/null +++ b/pebble/.github/workflows/code-cover-gen.yaml @@ -0,0 +1,71 @@ +name: PR code coverage (generate) + +on: + # This workflow does not have access to secrets because it runs on top of + # potentially unsafe changes. + pull_request: + types: [ opened, reopened, synchronize ] + branches: [ master ] + +jobs: + # The results of this job are uploaded as artifacts. A separate job will + # download the artifacts and upload them to a GCS bucket. + code-cover-gen: + runs-on: ubuntu-latest + env: + PR: ${{ github.event.pull_request.number }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + GH_TOKEN: ${{ github.token }} + steps: + - uses: actions/checkout@v3 + with: + # By default, checkout merges the PR into the current master. + # Instead, we want to check out the PR as-is. + ref: ${{ github.event.pull_request.head.sha }} + # Fetch all branches and history (we'll need the origin/master ref and + # the base commit). 
+ fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: "1.21" + + - name: Get list of changed packages + shell: bash + run: | + set -euxo pipefail + # To get the base commit, we get the number of commits in the PR. + # Note that github.event.pull_request.base.sha is not what we want, + # that is the tip of master and not necessarily the PR fork point. + NUM_COMMITS=$(gh pr view $PR --json commits --jq '.commits | length') + BASE_SHA=$(git rev-parse HEAD~${NUM_COMMITS}) + CHANGED_PKGS=$(scripts/changed-go-pkgs.sh ${BASE_SHA} ${HEAD_SHA}) + echo "BASE_SHA=${BASE_SHA}" >> "${GITHUB_ENV}" + echo "CHANGED_PKGS=${CHANGED_PKGS}" >> "${GITHUB_ENV}" + + - name: Generate "after" coverage + shell: bash + run: | + set -euxo pipefail + CHANGED_PKGS='${{ env.CHANGED_PKGS }}' + mkdir -p artifacts + # Make a copy of the script so that the "before" run below uses the + # same version. + cp scripts/pr-codecov-run-tests.sh ${RUNNER_TEMP}/ + ${RUNNER_TEMP}/pr-codecov-run-tests.sh artifacts/cover-${PR}-${HEAD_SHA}.json "${CHANGED_PKGS}" + + - name: Generate "before" coverage + shell: bash + run: | + set -euxo pipefail + BASE_SHA='${{ env.BASE_SHA }}' + CHANGED_PKGS='${{ env.CHANGED_PKGS }}' + git checkout -f ${BASE_SHA} + ${RUNNER_TEMP}/pr-codecov-run-tests.sh artifacts/cover-${PR}-${BASE_SHA}.json "${CHANGED_PKGS}" + + - name: Upload artifacts + uses: actions/upload-artifact@v2 + with: + name: cover + path: artifacts/cover-*.json diff --git a/pebble/.github/workflows/code-cover-publish.yaml b/pebble/.github/workflows/code-cover-publish.yaml new file mode 100644 index 0000000..ba5f63c --- /dev/null +++ b/pebble/.github/workflows/code-cover-publish.yaml @@ -0,0 +1,55 @@ +name: PR code coverage (publish) + +on: + workflow_run: + workflows: [ "PR code coverage (generate)" ] + types: [ "completed" ] + + +jobs: + # This job downloads the artifacts genearted by the code-cover-gen job and + # uploads them to a GCS bucket, from where Reviewable can access 
them. + code-cover-publish: + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + steps: + - name: 'Download artifact' + uses: actions/github-script@v3.1.0 + with: + script: | + var artifacts = await github.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: ${{github.event.workflow_run.id }}, + }); + var matchArtifact = artifacts.data.artifacts.filter((artifact) => { + return artifact.name == "cover" + })[0]; + var download = await github.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: matchArtifact.id, + archive_format: 'zip', + }); + var fs = require('fs'); + fs.writeFileSync('${{github.workspace}}/cover.zip', Buffer.from(download.data)); + + - run: | + mkdir -p cover + unzip cover.zip -d cover + + - name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + credentials_json: '${{ secrets.CODECOVER_SERVICE_ACCOUNT_KEY }}' + + - name: 'Upload to GCS' + uses: 'google-github-actions/upload-cloud-storage@v1' + with: + path: 'cover' + glob: '**/cover-*.json' + parent: false + destination: 'crl-codecover-public/pr-pebble/' + process_gcloudignore: false diff --git a/pebble/.github/workflows/nightly-code-cover.yaml b/pebble/.github/workflows/nightly-code-cover.yaml new file mode 100644 index 0000000..5c444c3 --- /dev/null +++ b/pebble/.github/workflows/nightly-code-cover.yaml @@ -0,0 +1,48 @@ +name: Nightly code coverage + +on: + schedule: + - cron: '00 08 * * * ' + workflow_dispatch: + +jobs: + coverage-gen-and-publish: + runs-on: ubuntu-latest + env: + GH_TOKEN: ${{ github.token }} + + steps: + - uses: actions/checkout@v3 + with: + # By default, checkout merges the PR into the current master. + # Instead, we want to check out the PR as-is. 
+ ref: ${{ github.event.pull_request.head.sha }} + # Fetch all branches and history (we'll need the origin/master ref and + # the base commit). + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: "1.21" + + - name: Generate coverage + run: scripts/code-coverage.sh + + - name: Install lcov + run: | + sudo apt-get update + sudo apt-get install lcov + + - name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + credentials_json: '${{ secrets.CODECOVER_SERVICE_ACCOUNT_KEY }}' + + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' + with: + version: '>= 363.0.0' + + - name: Publish coverage + run: scripts/code-coverage-publish.sh diff --git a/pebble/.github/workflows/sanitizers.yaml b/pebble/.github/workflows/sanitizers.yaml new file mode 100644 index 0000000..a9da116 --- /dev/null +++ b/pebble/.github/workflows/sanitizers.yaml @@ -0,0 +1,32 @@ +name: Sanitizers + +on: + schedule: + - cron: "0 0 * * *" # Midnight UTC, daily. 
+ +jobs: + linux-asan: + name: go-linux-asan + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: "1.21" + + - run: make testasan + + linux-msan: + name: go-linux-msan + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: "1.21" + + - run: make testmsan diff --git a/pebble/.github/workflows/stale.yml b/pebble/.github/workflows/stale.yml new file mode 100644 index 0000000..92d39cf --- /dev/null +++ b/pebble/.github/workflows/stale.yml @@ -0,0 +1,34 @@ +name: Mark stale issues and pull requests + +on: + schedule: + - cron: "0 11 * * 1-4" + workflow_dispatch: + +jobs: + stale: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v3 + with: + operations-per-run: 1000 + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: | + We have marked this issue as stale because it has been inactive for + 18 months. If this issue is still relevant, removing the stale label + or adding a comment will keep it active. Otherwise, we'll close it + in 10 days to keep the issue queue tidy. Thank you for your + contribution to Pebble! + stale-pr-message: 'Stale pull request message' + stale-issue-label: 'no-issue-activity' + stale-pr-label: 'no-pr-activity' + close-issue-label: 'X-stale' + close-pr-label: 'X-stale' + # Disable this for PR's, by setting a very high bar + days-before-pr-stale: 99999 + days-before-issue-stale: 540 + days-before-close: 10 + exempt-issue-labels: 'X-nostale' diff --git a/pebble/.gitignore b/pebble/.gitignore new file mode 100644 index 0000000..87ef192 --- /dev/null +++ b/pebble/.gitignore @@ -0,0 +1,9 @@ +# Github action artifacts. +artifacts +# Profiling artifacts. 
+cpu.*.prof +heap.prof +mutex.prof +coverprofile.out +# Testing artifacts +meta.*.test diff --git a/pebble/LICENSE b/pebble/LICENSE new file mode 100644 index 0000000..fec05ce --- /dev/null +++ b/pebble/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The LevelDB-Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pebble/Makefile b/pebble/Makefile new file mode 100644 index 0000000..e430ff2 --- /dev/null +++ b/pebble/Makefile @@ -0,0 +1,131 @@ +GO := go +PKG := ./... +GOFLAGS := +STRESSFLAGS := +TAGS := invariants +TESTS := . 
+COVER_PROFILE := coverprofile.out + +.PHONY: all +all: + @echo usage: + @echo " make test" + @echo " make testrace" + @echo " make stress" + @echo " make stressrace" + @echo " make stressmeta" + @echo " make crossversion-meta" + @echo " make testcoverage" + @echo " make mod-update" + @echo " make generate" + @echo " make generate-test-data" + @echo " make clean" + +override testflags := +.PHONY: test +test: + ${GO} test -tags '$(TAGS)' ${testflags} -run ${TESTS} ${PKG} + +.PHONY: testcoverage +testcoverage: + ${GO} test -tags '$(TAGS)' ${testflags} -run ${TESTS} ${PKG} -coverprofile ${COVER_PROFILE} + +.PHONY: testrace +testrace: testflags += -race -timeout 20m +testrace: test + +testasan: testflags += -asan -timeout 20m +testasan: test + +testmsan: export CC=clang +testmsan: testflags += -msan -timeout 20m +testmsan: test + +.PHONY: testobjiotracing +testobjiotracing: + ${GO} test -tags '$(TAGS) pebble_obj_io_tracing' ${testflags} -run ${TESTS} ./objstorage/objstorageprovider/objiotracing + +.PHONY: lint +lint: + ${GO} test -tags '$(TAGS)' ${testflags} -run ${TESTS} ./internal/lint + +.PHONY: stress stressrace +stressrace: testflags += -race +stress stressrace: testflags += -exec 'stress ${STRESSFLAGS}' -timeout 0 -test.v +stress stressrace: test + +.PHONY: stressmeta +stressmeta: override PKG = ./internal/metamorphic +stressmeta: override STRESSFLAGS += -p 1 +stressmeta: override TESTS = TestMeta$$ +stressmeta: stress + +.PHONY: crossversion-meta +crossversion-meta: + $(eval LATEST_RELEASE := $(shell git fetch origin && git branch -r --list '*/crl-release-*' | grep -o 'crl-release-.*$$' | sort | tail -1)) + git checkout ${LATEST_RELEASE}; \ + ${GO} test -c ./internal/metamorphic -o './internal/metamorphic/crossversion/${LATEST_RELEASE}.test'; \ + git checkout -; \ + ${GO} test -c ./internal/metamorphic -o './internal/metamorphic/crossversion/head.test'; \ + ${GO} test -tags '$(TAGS)' ${testflags} -v -run 'TestMetaCrossVersion' ./internal/metamorphic/crossversion 
--version '${LATEST_RELEASE},${LATEST_RELEASE},${LATEST_RELEASE}.test' --version 'HEAD,HEAD,./head.test' + +.PHONY: stress-crossversion +stress-crossversion: + STRESS=1 ./scripts/run-crossversion-meta.sh crl-release-21.2 crl-release-22.1 crl-release-22.2 crl-release-23.1 master + +.PHONY: generate +generate: + ${GO} generate ${PKG} + +generate: + +# Note that the output of generate-test-data is not deterministic. This should +# only be run manually as needed. +.PHONY: generate-test-data +generate-test-data: + ${GO} run -tags make_incorrect_manifests ./tool/make_incorrect_manifests.go + ${GO} run -tags make_test_find_db ./tool/make_test_find_db.go + ${GO} run -tags make_test_sstables ./tool/make_test_sstables.go + ${GO} run -tags make_test_remotecat ./tool/make_test_remotecat.go + +mod-update: + ${GO} get -u + ${GO} mod tidy + +.PHONY: clean +clean: + rm -f $(patsubst %,%.test,$(notdir $(shell go list ${PKG}))) + +git_dirty := $(shell git status -s) + +.PHONY: git-clean-check +git-clean-check: +ifneq ($(git_dirty),) + @echo "Git repository is dirty!" + @false +else + @echo "Git repository is clean." +endif + +.PHONY: mod-tidy-check +mod-tidy-check: +ifneq ($(git_dirty),) + $(error mod-tidy-check must be invoked on a clean repository) +endif + @${GO} mod tidy + $(MAKE) git-clean-check + +# TODO(radu): switch back to @latest once bogus doc changes are +# addressed; see https://github.com/cockroachdb/crlfmt/pull/44 +.PHONY: format +format: + go install github.com/cockroachdb/crlfmt@44a36ec7 && crlfmt -w -tab 2 . 
+ +.PHONY: format-check +format-check: +ifneq ($(git_dirty),) + $(error format-check must be invoked on a clean repository) +endif + $(MAKE) format + git diff + $(MAKE) git-clean-check diff --git a/pebble/README.md b/pebble/README.md new file mode 100644 index 0000000..c09e45d --- /dev/null +++ b/pebble/README.md @@ -0,0 +1,226 @@ +# Pebble [![Build Status](https://github.com/cockroachdb/pebble/actions/workflows/ci.yaml/badge.svg?branch=master)](https://github.com/cockroachdb/pebble/actions/workflows/ci.yaml) [![GoDoc](https://godoc.org/github.com/cockroachdb/pebble?status.svg)](https://godoc.org/github.com/cockroachdb/pebble) [Coverage](https://storage.googleapis.com/crl-codecover-public/pebble/index.html) + +#### [Nightly benchmarks](https://cockroachdb.github.io/pebble/) + +Pebble is a LevelDB/RocksDB inspired key-value store focused on +performance and internal usage by CockroachDB. Pebble inherits the +RocksDB file formats and a few extensions such as range deletion +tombstones, table-level bloom filters, and updates to the MANIFEST +format. 
+ +Pebble intentionally does not aspire to include every feature in RocksDB and +specifically targets the use case and feature set needed by CockroachDB: + +* Block-based tables +* Checkpoints +* Indexed batches +* Iterator options (lower/upper bound, table filter) +* Level-based compaction +* Manual compaction +* Merge operator +* Prefix bloom filters +* Prefix iteration +* Range deletion tombstones +* Reverse iteration +* SSTable ingestion +* Single delete +* Snapshots +* Table-level bloom filters + +RocksDB has a large number of features that are not implemented in +Pebble: + +* Backups +* Column families +* Delete files in range +* FIFO compaction style +* Forward iterator / tailing iterator +* Hash table format +* Memtable bloom filter +* Persistent cache +* Pin iterator key / value +* Plain table format +* SSTable ingest-behind +* Sub-compactions +* Transactions +* Universal compaction style + +***WARNING***: Pebble may silently corrupt data or behave incorrectly if +used with a RocksDB database that uses a feature Pebble doesn't +support. Caveat emptor! + +## Production Ready + +Pebble was introduced as an alternative storage engine to RocksDB in +CockroachDB v20.1 (released May 2020) and was used in production +successfully at that time. Pebble was made the default storage engine +in CockroachDB v20.2 (released Nov 2020). Pebble is being used in +production by users of CockroachDB at scale and is considered stable +and production ready. + +## Advantages + +Pebble offers several improvements over RocksDB: + +* Faster reverse iteration via backwards links in the memtable's + skiplist. +* Faster commit pipeline that achieves better concurrency. +* Seamless merged iteration of indexed batches. The mutations in the + batch conceptually occupy another memtable level. +* L0 sublevels and flush splitting for concurrent compactions out of L0 and + reduced read-amplification during heavy write load. 
+* Faster LSM edits in LSMs with large numbers of sstables through use of a + copy-on-write B-tree to hold file metadata. +* Delete-only compactions that drop whole sstables that fall within the bounds + of a range deletion. +* Block-property collectors and filters that enable iterators to skip tables, + index blocks and data blocks that are irrelevant, according to user-defined + properties over key-value pairs. +* Range keys API, allowing KV pairs defined over a range of keyspace with + user-defined semantics and interleaved during iteration. +* Smaller, more approachable code base. + +See the [Pebble vs RocksDB: Implementation +Differences](docs/rocksdb.md) doc for more details on implementation +differences. + +## RocksDB Compatibility + +Pebble strives for forward compatibility with RocksDB 6.2.1 (the latest +version of RocksDB used by CockroachDB). Forward compatibility means +that a DB generated by RocksDB can be used by Pebble. Currently, Pebble +provides bidirectional compatibility with RocksDB (a Pebble generated DB +can be used by RocksDB) when using its FormatMostCompatible format. New +functionality that is backwards incompatible is gated behind new format +major versions. In general, Pebble only provides compatibility with the +subset of functionality and configuration used by CockroachDB. The scope +of RocksDB functionality and configuration is too large to adequately +test and document all the incompatibilities. The list below contains +known incompatibilities. + +* Pebble's use of WAL recycling is only compatible with RocksDB's + `kTolerateCorruptedTailRecords` WAL recovery mode. Older versions of + RocksDB would automatically map incompatible WAL recovery modes to + `kTolerateCorruptedTailRecords`. New versions of RocksDB will + disable WAL recycling. +* Column families. Pebble does not support column families, nor does + it attempt to detect their usage when opening a DB that may contain + them. +* Hash table format. 
Pebble does not support the hash table sstable
+  format.
+* Plain table format. Pebble does not support the plain table sstable
+  format.
+* SSTable format version 3 and 4. Pebble does not support version 3
+  and version 4 format sstables. The sstable format version is
+  controlled by the `BlockBasedTableOptions::format_version` option.
+  See [#97](https://github.com/cockroachdb/pebble/issues/97).
+
+## Format major versions
+
+Over time Pebble has introduced new physical file formats. Backwards
+incompatible changes are made through the introduction of 'format major
+versions'. By default, when Pebble opens a database, it defaults to
+`FormatMostCompatible`. This version is bi-directionally compatible with RocksDB
+6.2.1 (with the caveats described above).
+
+To opt into new formats, a user may set `FormatMajorVersion` on the
+[`Options`](https://pkg.go.dev/github.com/cockroachdb/pebble#Options)
+supplied to
+[`Open`](https://pkg.go.dev/github.com/cockroachdb/pebble#Open), or
+upgrade the format major version at runtime using
+[`DB.RatchetFormatMajorVersion`](https://pkg.go.dev/github.com/cockroachdb/pebble#DB.RatchetFormatMajorVersion).
+Format major version upgrades are permanent; there is no option to
+return to an earlier format.
+ +The table below outlines the history of format major versions: + +| Name | Value | Migration | +|------------------------------------|-------|------------| +| FormatMostCompatible | 1 | No | +| FormatVersioned | 3 | No | +| FormatSetWithDelete | 4 | No | +| FormatBlockPropertyCollector | 5 | No | +| FormatSplitUserKeysMarked | 6 | Background | +| FormatSplitUserKeysMarkedCompacted | 7 | Blocking | +| FormatRangeKeys | 8 | No | +| FormatMinTableFormatPebblev1 | 9 | No | +| FormatPrePebblev1Marked | 10 | Background | +| FormatSSTableValueBlocks | 12 | No | +| FormatFlushableIngest | 13 | No | +| FormatPrePebblev1MarkedCompacted | 14 | Blocking | +| FormatDeleteSizedAndObsolete | 15 | No | +| FormatVirtualSSTables | 16 | No | + +Upgrading to a format major version with 'Background' in the migration +column may trigger background activity to rewrite physical file +formats, typically through compactions. Upgrading to a format major +version with 'Blocking' in the migration column will block until a +migration is complete. The database may continue to serve reads and +writes if upgrading a live database through +`RatchetFormatMajorVersion`, but the method call will not return until +the migration is complete. + +For reference, the table below lists the range of supported Pebble format major +versions for CockroachDB releases. 
+
+| CockroachDB release | Earliest supported | Latest supported |
+|---------------------|------------------------------------|---------------------------|
+| 20.1 through 21.1 | FormatMostCompatible | FormatMostCompatible |
+| 21.2 | FormatMostCompatible | FormatSetWithDelete |
+| 22.1 | FormatMostCompatible | FormatSplitUserKeysMarked |
+| 22.2 | FormatMostCompatible | FormatPrePebblev1Marked |
+| 23.1 | FormatSplitUserKeysMarkedCompacted | FormatFlushableIngest |
+| 23.2 | FormatSplitUserKeysMarkedCompacted | FormatVirtualSSTables |
+| 24.1 plan | FormatSSTableValueBlocks | |
+
+## Pedigree
+
+Pebble is based on the incomplete Go version of LevelDB:
+
+https://github.com/golang/leveldb
+
+The Go version of LevelDB is based on the C++ original:
+
+https://github.com/google/leveldb
+
+Optimizations and inspiration were drawn from RocksDB:
+
+https://github.com/facebook/rocksdb
+
+## Getting Started
+
+### Example Code
+
+```go
+package main
+
+import (
+	"fmt"
+	"log"
+
+	"github.com/cockroachdb/pebble"
+)
+
+func main() {
+	db, err := pebble.Open("demo", &pebble.Options{})
+	if err != nil {
+		log.Fatal(err)
+	}
+	key := []byte("hello")
+	if err := db.Set(key, []byte("world"), pebble.Sync); err != nil {
+		log.Fatal(err)
+	}
+	value, closer, err := db.Get(key)
+	if err != nil {
+		log.Fatal(err)
+	}
+	fmt.Printf("%s %s\n", key, value)
+	if err := closer.Close(); err != nil {
+		log.Fatal(err)
+	}
+	if err := db.Close(); err != nil {
+		log.Fatal(err)
+	}
+}
+```
diff --git a/pebble/batch.go b/pebble/batch.go
new file mode 100644
index 0000000..c3dbfcc
--- /dev/null
+++ b/pebble/batch.go
@@ -0,0 +1,2312 @@
+// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
+// of this source code is governed by a BSD-style license that can be found in
+// the LICENSE file.
+ +package pebble + +import ( + "context" + "encoding/binary" + "fmt" + "io" + "math" + "sort" + "sync" + "sync/atomic" + "time" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/batchskl" + "github.com/cockroachdb/pebble/internal/humanize" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/private" + "github.com/cockroachdb/pebble/internal/rangedel" + "github.com/cockroachdb/pebble/internal/rangekey" + "github.com/cockroachdb/pebble/internal/rawalloc" +) + +const ( + batchCountOffset = 8 + batchHeaderLen = 12 + batchInitialSize = 1 << 10 // 1 KB + batchMaxRetainedSize = 1 << 20 // 1 MB + invalidBatchCount = 1<<32 - 1 + maxVarintLen32 = 5 +) + +// ErrNotIndexed means that a read operation on a batch failed because the +// batch is not indexed and thus doesn't support reads. +var ErrNotIndexed = errors.New("pebble: batch not indexed") + +// ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted. +var ErrInvalidBatch = base.MarkCorruptionError(errors.New("pebble: invalid batch")) + +// ErrBatchTooLarge indicates that a batch is invalid or otherwise corrupted. +var ErrBatchTooLarge = base.MarkCorruptionError(errors.Newf("pebble: batch too large: >= %s", humanize.Bytes.Uint64(maxBatchSize))) + +// DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is +// being inserted into the batch. Indexing is not performed on the specified key +// until Finish is called, hence the name deferred. This struct lets the caller +// copy or encode keys/values directly into the batch representation instead of +// copying into an intermediary buffer then having pebble.Batch copy off of it. +type DeferredBatchOp struct { + index *batchskl.Skiplist + + // Key and Value point to parts of the binary batch representation where + // keys and values should be encoded/copied into. 
len(Key) and len(Value) + // bytes must be copied into these slices respectively before calling + // Finish(). Changing where these slices point to is not allowed. + Key, Value []byte + offset uint32 +} + +// Finish completes the addition of this batch operation, and adds it to the +// index if necessary. Must be called once (and exactly once) keys/values +// have been filled into Key and Value. Not calling Finish or not +// copying/encoding keys will result in an incomplete index, and calling Finish +// twice may result in a panic. +func (d DeferredBatchOp) Finish() error { + if d.index != nil { + if err := d.index.Add(d.offset); err != nil { + return err + } + } + return nil +} + +// A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets, +// RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch +// implements the Reader interface, but only an indexed batch supports reading +// (without error) via Get or NewIter. A non-indexed batch will return +// ErrNotIndexed when read from. A batch is not safe for concurrent use, and +// consumers should use a batch per goroutine or provide their own +// synchronization. +// +// # Indexing +// +// Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch +// allows iteration via an Iterator (see Batch.NewIter). The iterator provides +// a merged view of the operations in the batch and the underlying +// database. This is implemented by treating the batch as an additional layer +// in the LSM where every entry in the batch is considered newer than any entry +// in the underlying database (batch entries have the InternalKeySeqNumBatch +// bit set). By treating the batch as an additional layer in the LSM, iteration +// supports all batch operations (i.e. Set, Merge, Delete, DeleteRange, +// RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort. 
+// +// The same key can be operated on multiple times in a batch, though only the +// latest operation will be visible. For example, Put("a", "b"), Delete("a") +// will cause the key "a" to not be visible in the batch. Put("a", "b"), +// Put("a", "c") will cause a read of "a" to return the value "c". +// +// The batch index is implemented via an skiplist (internal/batchskl). While +// the skiplist implementation is very fast, inserting into an indexed batch is +// significantly slower than inserting into a non-indexed batch. Only use an +// indexed batch if you require reading from it. +// +// # Atomic commit +// +// The operations in a batch are persisted by calling Batch.Commit which is +// equivalent to calling DB.Apply(batch). A batch is committed atomically by +// writing the internal batch representation to the WAL, adding all of the +// batch operations to the memtable associated with the WAL, and then +// incrementing the visible sequence number so that subsequent reads can see +// the effects of the batch operations. If WriteOptions.Sync is true, a call to +// Batch.Commit will guarantee that the batch is persisted to disk before +// returning. See commitPipeline for more on the implementation details. +// +// # Large batches +// +// The size of a batch is limited only by available memory (be aware that +// indexed batches require considerably additional memory for the skiplist +// structure). A given WAL file has a single memtable associated with it (this +// restriction could be removed, but doing so is onerous and complex). And a +// memtable has a fixed size due to the underlying fixed size arena. Note that +// this differs from RocksDB where a memtable can grow arbitrarily large using +// a list of arena chunks. In RocksDB this is accomplished by storing pointers +// in the arena memory, but that isn't possible in Go. 
+// +// During Batch.Commit, a batch which is larger than a threshold (> +// MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue +// of memtables. A flushableBatch forces WAL to be rotated, but that happens +// anyways when the memtable becomes full so this does not cause significant +// WAL churn. Because the flushableBatch is readable as another layer in the +// LSM, Batch.Commit returns as soon as the flushableBatch has been added to +// the queue of memtables. +// +// Internally, a flushableBatch provides Iterator support by sorting the batch +// contents (the batch is sorted once, when it is added to the memtable +// queue). Sorting the batch contents and insertion of the contents into a +// memtable have the same big-O time, but the constant factor dominates +// here. Sorting is significantly faster and uses significantly less memory. +// +// # Internal representation +// +// The internal batch representation is a contiguous byte buffer with a fixed +// 12-byte header, followed by a series of records. +// +// +-------------+------------+--- ... ---+ +// | SeqNum (8B) | Count (4B) | Entries | +// +-------------+------------+--- ... ---+ +// +// Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed +// strings (varstring): +// +// +-----------+-----------------+-------------------+ +// | Kind (1B) | Key (varstring) | Value (varstring) | +// +-----------+-----------------+-------------------+ +// +// A varstring is a varint32 followed by N bytes of data. The Kind tags are +// exactly those specified by InternalKeyKind. 
The following table shows the +// format for records of each kind: +// +// InternalKeyKindDelete varstring +// InternalKeyKindLogData varstring +// InternalKeyKindIngestSST varstring +// InternalKeyKindSet varstring varstring +// InternalKeyKindMerge varstring varstring +// InternalKeyKindRangeDelete varstring varstring +// InternalKeyKindRangeKeySet varstring varstring +// InternalKeyKindRangeKeyUnset varstring varstring +// InternalKeyKindRangeKeyDelete varstring varstring +// +// The intuitive understanding here are that the arguments to Delete, Set, +// Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The +// RangeKeySet and RangeKeyUnset operations are slightly more complicated, +// encoding their end key, suffix and value [in the case of RangeKeySet] within +// the Value varstring. For more information on the value encoding for +// RangeKeySet and RangeKeyUnset, see the internal/rangekey package. +// +// The internal batch representation is the on disk format for a batch in the +// WAL, and thus stable. New record kinds may be added, but the existing ones +// will not be modified. +type Batch struct { + batchInternal + applied atomic.Bool +} + +// batchInternal contains the set of fields within Batch that are non-atomic and +// capable of being reset using a *b = batchInternal{} struct copy. +type batchInternal struct { + // Data is the wire format of a batch's log entry: + // - 8 bytes for a sequence number of the first batch element, + // or zeroes if the batch has not yet been applied, + // - 4 bytes for the count: the number of elements in the batch, + // or "\xff\xff\xff\xff" if the batch is invalid, + // - count elements, being: + // - one byte for the kind + // - the varint-string user key, + // - the varint-string value (if kind != delete). + // The sequence number and count are stored in little-endian order. + // + // The data field can be (but is not guaranteed to be) nil for new + // batches. 
Large batches will set the data field to nil when committed as + // the data has been moved to a flushableBatch and inserted into the queue of + // memtables. + data []byte + cmp Compare + formatKey base.FormatKey + abbreviatedKey AbbreviatedKey + + // An upper bound on required space to add this batch to a memtable. + // Note that although batches are limited to 4 GiB in size, that limit + // applies to len(data), not the memtable size. The upper bound on the + // size of a memtable node is larger than the overhead of the batch's log + // encoding, so memTableSize is larger than len(data) and may overflow a + // uint32. + memTableSize uint64 + + // The db to which the batch will be committed. Do not change this field + // after the batch has been created as it might invalidate internal state. + // Batch.memTableSize is only refreshed if Batch.db is set. Setting db to + // nil once it has been set implies that the Batch has encountered an error. + db *DB + + // The count of records in the batch. This count will be stored in the batch + // data whenever Repr() is called. + count uint64 + + // The count of range deletions in the batch. Updated every time a range + // deletion is added. + countRangeDels uint64 + + // The count of range key sets, unsets and deletes in the batch. Updated + // every time a RANGEKEYSET, RANGEKEYUNSET or RANGEKEYDEL key is added. + countRangeKeys uint64 + + // A deferredOp struct, stored in the Batch so that a pointer can be returned + // from the *Deferred() methods rather than a value. + deferredOp DeferredBatchOp + + // An optional skiplist keyed by offset into data of the entry. + index *batchskl.Skiplist + rangeDelIndex *batchskl.Skiplist + rangeKeyIndex *batchskl.Skiplist + + // Fragmented range deletion tombstones. Cached the first time a range + // deletion iterator is requested. The cache is invalidated whenever a new + // range deletion is added to the batch. 
This cache can only be used when + // opening an iterator to read at a batch sequence number >= + // tombstonesSeqNum. This is the case for all new iterators created over a + // batch but it's not the case for all cloned iterators. + tombstones []keyspan.Span + tombstonesSeqNum uint64 + + // Fragmented range key spans. Cached the first time a range key iterator is + // requested. The cache is invalidated whenever a new range key + // (RangeKey{Set,Unset,Del}) is added to the batch. This cache can only be + // used when opening an iterator to read at a batch sequence number >= + // tombstonesSeqNum. This is the case for all new iterators created over a + // batch but it's not the case for all cloned iterators. + rangeKeys []keyspan.Span + rangeKeysSeqNum uint64 + + // The flushableBatch wrapper if the batch is too large to fit in the + // memtable. + flushable *flushableBatch + + // minimumFormatMajorVersion indicates the format major version required in + // order to commit this batch. If an operation requires a particular format + // major version, it ratchets the batch's minimumFormatMajorVersion. When + // the batch is committed, this is validated against the database's current + // format major version. + minimumFormatMajorVersion FormatMajorVersion + + // Synchronous Apply uses the commit WaitGroup for both publishing the + // seqnum and waiting for the WAL fsync (if needed). Asynchronous + // ApplyNoSyncWait, which implies WriteOptions.Sync is true, uses the commit + // WaitGroup for publishing the seqnum and the fsyncWait WaitGroup for + // waiting for the WAL fsync. 
+ // + // TODO(sumeer): if we find that ApplyNoSyncWait in conjunction with + // SyncWait is causing higher memory usage because of the time duration + // between when the sync is already done, and a goroutine calls SyncWait + // (followed by Batch.Close), we could separate out {fsyncWait, commitErr} + // into a separate struct that is allocated separately (using another + // sync.Pool), and only that struct needs to outlive Batch.Close (which + // could then be called immediately after ApplyNoSyncWait). commitStats + // will also need to be in this separate struct. + commit sync.WaitGroup + fsyncWait sync.WaitGroup + + commitStats BatchCommitStats + + commitErr error + + // Position bools together to reduce the sizeof the struct. + + // ingestedSSTBatch indicates that the batch contains one or more key kinds + // of InternalKeyKindIngestSST. If the batch contains key kinds of IngestSST + // then it will only contain key kinds of IngestSST. + ingestedSSTBatch bool + + // committing is set to true when a batch begins to commit. It's used to + // ensure the batch is not mutated concurrently. It is not an atomic + // deliberately, so as to avoid the overhead on batch mutations. This is + // okay, because under correct usage this field will never be accessed + // concurrently. It's only under incorrect usage the memory accesses of this + // variable may violate memory safety. Since we don't use atomics here, + // false negatives are possible. + committing bool +} + +// BatchCommitStats exposes stats related to committing a batch. +// +// NB: there is no Pebble internal tracing (using LoggerAndTracer) of slow +// batch commits. The caller can use these stats to do their own tracing as +// needed. +type BatchCommitStats struct { + // TotalDuration is the time spent in DB.{Apply,ApplyNoSyncWait} or + // Batch.Commit, plus the time waiting in Batch.SyncWait. 
If there is a gap + // between calling ApplyNoSyncWait and calling SyncWait, that gap could + // include some duration in which real work was being done for the commit + // and will not be included here. This missing time is considered acceptable + // since the goal of these stats is to understand user-facing latency. + // + // TotalDuration includes time spent in various queues both inside Pebble + // and outside Pebble (I/O queues, goroutine scheduler queue, mutex wait + // etc.). For some of these queues (which we consider important) the wait + // times are included below -- these expose low-level implementation detail + // and are meant for expert diagnosis and subject to change. There may be + // unaccounted time after subtracting those values from TotalDuration. + TotalDuration time.Duration + // SemaphoreWaitDuration is the wait time for semaphores in + // commitPipeline.Commit. + SemaphoreWaitDuration time.Duration + // WALQueueWaitDuration is the wait time for allocating memory blocks in the + // LogWriter (due to the LogWriter not writing fast enough). At the moment + // this is duration is always zero because a single WAL will allow + // allocating memory blocks up to the entire memtable size. In the future, + // we may pipeline WALs and bound the WAL queued blocks separately, so this + // field is preserved for that possibility. + WALQueueWaitDuration time.Duration + // MemTableWriteStallDuration is the wait caused by a write stall due to too + // many memtables (due to not flushing fast enough). + MemTableWriteStallDuration time.Duration + // L0ReadAmpWriteStallDuration is the wait caused by a write stall due to + // high read amplification in L0 (due to not compacting fast enough out of + // L0). + L0ReadAmpWriteStallDuration time.Duration + // WALRotationDuration is the wait time for WAL rotation, which includes + // syncing and closing the old WAL and creating (or reusing) a new one. 
+ WALRotationDuration time.Duration + // CommitWaitDuration is the wait for publishing the seqnum plus the + // duration for the WAL sync (if requested). The former should be tiny and + // one can assume that this is all due to the WAL sync. + CommitWaitDuration time.Duration +} + +var _ Reader = (*Batch)(nil) +var _ Writer = (*Batch)(nil) + +var batchPool = sync.Pool{ + New: func() interface{} { + return &Batch{} + }, +} + +type indexedBatch struct { + batch Batch + index batchskl.Skiplist +} + +var indexedBatchPool = sync.Pool{ + New: func() interface{} { + return &indexedBatch{} + }, +} + +func newBatch(db *DB) *Batch { + b := batchPool.Get().(*Batch) + b.db = db + return b +} + +func newBatchWithSize(db *DB, size int) *Batch { + b := newBatch(db) + if cap(b.data) < size { + b.data = rawalloc.New(0, size) + } + return b +} + +func newIndexedBatch(db *DB, comparer *Comparer) *Batch { + i := indexedBatchPool.Get().(*indexedBatch) + i.batch.cmp = comparer.Compare + i.batch.formatKey = comparer.FormatKey + i.batch.abbreviatedKey = comparer.AbbreviatedKey + i.batch.db = db + i.batch.index = &i.index + i.batch.index.Init(&i.batch.data, i.batch.cmp, i.batch.abbreviatedKey) + return &i.batch +} + +func newIndexedBatchWithSize(db *DB, comparer *Comparer, size int) *Batch { + b := newIndexedBatch(db, comparer) + if cap(b.data) < size { + b.data = rawalloc.New(0, size) + } + return b +} + +// nextSeqNum returns the batch "sequence number" that will be given to the next +// key written to the batch. During iteration keys within an indexed batch are +// given a sequence number consisting of their offset within the batch combined +// with the base.InternalKeySeqNumBatch bit. These sequence numbers are only +// used during iteration, and the keys are assigned ordinary sequence numbers +// when the batch is committed. 
+func (b *Batch) nextSeqNum() uint64 { + return uint64(len(b.data)) | base.InternalKeySeqNumBatch +} + +func (b *Batch) release() { + if b.db == nil { + // The batch was not created using newBatch or newIndexedBatch, or an error + // was encountered. We don't try to reuse batches that encountered an error + // because they might be stuck somewhere in the system and attempting to + // reuse such batches is a recipe for onerous debugging sessions. Instead, + // let the GC do its job. + return + } + b.db = nil + + // NB: This is ugly (it would be cleaner if we could just assign a Batch{}), + // but necessary so that we can use atomic.StoreUint32 for the Batch.applied + // field. Without using an atomic to clear that field the Go race detector + // complains. + b.Reset() + b.cmp = nil + b.formatKey = nil + b.abbreviatedKey = nil + + if b.index == nil { + batchPool.Put(b) + } else { + b.index, b.rangeDelIndex, b.rangeKeyIndex = nil, nil, nil + indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b))) + } +} + +func (b *Batch) refreshMemTableSize() error { + b.memTableSize = 0 + if len(b.data) < batchHeaderLen { + return nil + } + + b.countRangeDels = 0 + b.countRangeKeys = 0 + b.minimumFormatMajorVersion = 0 + for r := b.Reader(); ; { + kind, key, value, ok, err := r.Next() + if !ok { + if err != nil { + return err + } + break + } + switch kind { + case InternalKeyKindRangeDelete: + b.countRangeDels++ + case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: + b.countRangeKeys++ + case InternalKeyKindDeleteSized: + if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete { + b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete + } + case InternalKeyKindIngestSST: + if b.minimumFormatMajorVersion < FormatFlushableIngest { + b.minimumFormatMajorVersion = FormatFlushableIngest + } + // This key kind doesn't contribute to the memtable size. 
+ continue + } + b.memTableSize += memTableEntrySize(len(key), len(value)) + } + if b.countRangeKeys > 0 && b.minimumFormatMajorVersion < FormatRangeKeys { + b.minimumFormatMajorVersion = FormatRangeKeys + } + return nil +} + +// Apply the operations contained in the batch to the receiver batch. +// +// It is safe to modify the contents of the arguments after Apply returns. +func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error { + if b.ingestedSSTBatch { + panic("pebble: invalid batch application") + } + if len(batch.data) == 0 { + return nil + } + if len(batch.data) < batchHeaderLen { + return ErrInvalidBatch + } + + offset := len(b.data) + if offset == 0 { + b.init(offset) + offset = batchHeaderLen + } + b.data = append(b.data, batch.data[batchHeaderLen:]...) + + b.setCount(b.Count() + batch.Count()) + + if b.db != nil || b.index != nil { + // Only iterate over the new entries if we need to track memTableSize or in + // order to update the index. + for iter := BatchReader(b.data[offset:]); len(iter) > 0; { + offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) + kind, key, value, ok, err := iter.Next() + if !ok { + if err != nil { + return err + } + break + } + switch kind { + case InternalKeyKindRangeDelete: + b.countRangeDels++ + case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: + b.countRangeKeys++ + case InternalKeyKindIngestSST: + panic("pebble: invalid key kind for batch") + } + if b.index != nil { + var err error + switch kind { + case InternalKeyKindRangeDelete: + b.tombstones = nil + b.tombstonesSeqNum = 0 + if b.rangeDelIndex == nil { + b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) + } + err = b.rangeDelIndex.Add(uint32(offset)) + case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: + b.rangeKeys = nil + b.rangeKeysSeqNum = 0 + if b.rangeKeyIndex == nil { + b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, 
b.cmp, b.abbreviatedKey) + } + err = b.rangeKeyIndex.Add(uint32(offset)) + default: + err = b.index.Add(uint32(offset)) + } + if err != nil { + return err + } + } + b.memTableSize += memTableEntrySize(len(key), len(value)) + } + } + return nil +} + +// Get gets the value for the given key. It returns ErrNotFound if the Batch +// does not contain the key. +// +// The caller should not modify the contents of the returned slice, but it is +// safe to modify the contents of the argument after Get returns. The returned +// slice will remain valid until the returned Closer is closed. On success, the +// caller MUST call closer.Close() or a memory leak will occur. +func (b *Batch) Get(key []byte) ([]byte, io.Closer, error) { + if b.index == nil { + return nil, nil, ErrNotIndexed + } + return b.db.getInternal(key, b, nil /* snapshot */) +} + +func (b *Batch) prepareDeferredKeyValueRecord(keyLen, valueLen int, kind InternalKeyKind) { + if b.committing { + panic("pebble: batch already committing") + } + if len(b.data) == 0 { + b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen) + } + b.count++ + b.memTableSize += memTableEntrySize(keyLen, valueLen) + + pos := len(b.data) + b.deferredOp.offset = uint32(pos) + b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen) + b.data[pos] = byte(kind) + pos++ + + { + // TODO(peter): Manually inlined version binary.PutUvarint(). This is 20% + // faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future + // versions show this to not be a performance win. + x := uint32(keyLen) + for x >= 0x80 { + b.data[pos] = byte(x) | 0x80 + x >>= 7 + pos++ + } + b.data[pos] = byte(x) + pos++ + } + + b.deferredOp.Key = b.data[pos : pos+keyLen] + pos += keyLen + + { + // TODO(peter): Manually inlined version binary.PutUvarint(). This is 20% + // faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future + // versions show this to not be a performance win. 
+ x := uint32(valueLen) + for x >= 0x80 { + b.data[pos] = byte(x) | 0x80 + x >>= 7 + pos++ + } + b.data[pos] = byte(x) + pos++ + } + + b.deferredOp.Value = b.data[pos : pos+valueLen] + // Shrink data since varints may be shorter than the upper bound. + b.data = b.data[:pos+valueLen] +} + +func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) { + if b.committing { + panic("pebble: batch already committing") + } + if len(b.data) == 0 { + b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen) + } + b.count++ + b.memTableSize += memTableEntrySize(keyLen, 0) + + pos := len(b.data) + b.deferredOp.offset = uint32(pos) + b.grow(1 + maxVarintLen32 + keyLen) + b.data[pos] = byte(kind) + pos++ + + { + // TODO(peter): Manually inlined version binary.PutUvarint(). Remove if + // go1.13 or future versions show this to not be a performance win. See + // BenchmarkBatchSet. + x := uint32(keyLen) + for x >= 0x80 { + b.data[pos] = byte(x) | 0x80 + x >>= 7 + pos++ + } + b.data[pos] = byte(x) + pos++ + } + + b.deferredOp.Key = b.data[pos : pos+keyLen] + b.deferredOp.Value = nil + + // Shrink data since varint may be shorter than the upper bound. + b.data = b.data[:pos+keyLen] +} + +// AddInternalKey allows the caller to add an internal key of point key or range +// key kinds (but not RangeDelete) to a batch. Passing in an internal key of +// kind RangeDelete will result in a panic. Note that the seqnum in the internal +// key is effectively ignored, even though the Kind is preserved. This is +// because the batch format does not allow for a per-key seqnum to be specified, +// only a batch-wide one. +// +// Note that non-indexed keys (IngestKeyKind{LogData,IngestSST}) are not +// supported with this method as they require specialized logic. 
+func (b *Batch) AddInternalKey(key *base.InternalKey, value []byte, _ *WriteOptions) error { + keyLen := len(key.UserKey) + hasValue := false + switch kind := key.Kind(); kind { + case InternalKeyKindRangeDelete: + panic("unexpected range delete in AddInternalKey") + case InternalKeyKindSingleDelete, InternalKeyKindDelete: + b.prepareDeferredKeyRecord(keyLen, kind) + b.deferredOp.index = b.index + case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: + b.prepareDeferredKeyValueRecord(keyLen, len(value), kind) + hasValue = true + b.incrementRangeKeysCount() + default: + b.prepareDeferredKeyValueRecord(keyLen, len(value), kind) + hasValue = true + b.deferredOp.index = b.index + } + copy(b.deferredOp.Key, key.UserKey) + if hasValue { + copy(b.deferredOp.Value, value) + } + + // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining + // in go1.13 will remove the need for this. + if b.index != nil { + if err := b.index.Add(b.deferredOp.offset); err != nil { + return err + } + } + return nil +} + +// Set adds an action to the batch that sets the key to map to the value. +// +// It is safe to modify the contents of the arguments after Set returns. +func (b *Batch) Set(key, value []byte, _ *WriteOptions) error { + deferredOp := b.SetDeferred(len(key), len(value)) + copy(deferredOp.Key, key) + copy(deferredOp.Value, value) + // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining + // in go1.13 will remove the need for this. + if b.index != nil { + if err := b.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +// SetDeferred is similar to Set in that it adds a set operation to the batch, +// except it only takes in key/value lengths instead of complete slices, +// letting the caller encode into those objects and then call Finish() on the +// returned object. 
+func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp { + b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindSet) + b.deferredOp.index = b.index + return &b.deferredOp +} + +// Merge adds an action to the batch that merges the value at key with the new +// value. The details of the merge are dependent upon the configured merge +// operator. +// +// It is safe to modify the contents of the arguments after Merge returns. +func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error { + deferredOp := b.MergeDeferred(len(key), len(value)) + copy(deferredOp.Key, key) + copy(deferredOp.Value, value) + // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining + // in go1.13 will remove the need for this. + if b.index != nil { + if err := b.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +// MergeDeferred is similar to Merge in that it adds a merge operation to the +// batch, except it only takes in key/value lengths instead of complete slices, +// letting the caller encode into those objects and then call Finish() on the +// returned object. +func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp { + b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindMerge) + b.deferredOp.index = b.index + return &b.deferredOp +} + +// Delete adds an action to the batch that deletes the entry for key. +// +// It is safe to modify the contents of the arguments after Delete returns. +func (b *Batch) Delete(key []byte, _ *WriteOptions) error { + deferredOp := b.DeleteDeferred(len(key)) + copy(deferredOp.Key, key) + // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining + // in go1.13 will remove the need for this. 
+ if b.index != nil { + if err := b.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +// DeleteDeferred is similar to Delete in that it adds a delete operation to +// the batch, except it only takes in key/value lengths instead of complete +// slices, letting the caller encode into those objects and then call Finish() +// on the returned object. +func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp { + b.prepareDeferredKeyRecord(keyLen, InternalKeyKindDelete) + b.deferredOp.index = b.index + return &b.deferredOp +} + +// DeleteSized behaves identically to Delete, but takes an additional +// argument indicating the size of the value being deleted. DeleteSized +// should be preferred when the caller has the expectation that there exists +// a single internal KV pair for the key (eg, the key has not been +// overwritten recently), and the caller knows the size of its value. +// +// DeleteSized will record the value size within the tombstone and use it to +// inform compaction-picking heuristics which strive to reduce space +// amplification in the LSM. This "calling your shot" mechanic allows the +// storage engine to more accurately estimate and reduce space amplification. +// +// It is safe to modify the contents of the arguments after DeleteSized +// returns. +func (b *Batch) DeleteSized(key []byte, deletedValueSize uint32, _ *WriteOptions) error { + deferredOp := b.DeleteSizedDeferred(len(key), deletedValueSize) + copy(b.deferredOp.Key, key) + // TODO(peter): Manually inline DeferredBatchOp.Finish(). Check if in a + // later Go release this is unnecessary. 
+ if b.index != nil { + if err := b.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +// DeleteSizedDeferred is similar to DeleteSized in that it adds a sized delete +// operation to the batch, except it only takes in key length instead of a +// complete key slice, letting the caller encode into the DeferredBatchOp.Key +// slice and then call Finish() on the returned object. +func (b *Batch) DeleteSizedDeferred(keyLen int, deletedValueSize uint32) *DeferredBatchOp { + if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete { + b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete + } + + // Encode the sum of the key length and the value in the value. + v := uint64(deletedValueSize) + uint64(keyLen) + + // Encode `v` as a varint. + var buf [binary.MaxVarintLen64]byte + n := 0 + { + x := v + for x >= 0x80 { + buf[n] = byte(x) | 0x80 + x >>= 7 + n++ + } + buf[n] = byte(x) + n++ + } + + // NB: In batch entries and sstable entries, values are stored as + // varstrings. Here, the value is itself a simple varint. This results in an + // unnecessary double layer of encoding: + // varint(n) varint(deletedValueSize) + // The first varint will always be 1-byte, since a varint-encoded uint64 + // will never exceed 128 bytes. This unnecessary extra byte and wrapping is + // preserved to avoid special casing across the database, and in particular + // in sstable block decoding which is performance sensitive. + b.prepareDeferredKeyValueRecord(keyLen, n, InternalKeyKindDeleteSized) + b.deferredOp.index = b.index + copy(b.deferredOp.Value, buf[:n]) + return &b.deferredOp +} + +// SingleDelete adds an action to the batch that single deletes the entry for key. +// See Writer.SingleDelete for more details on the semantics of SingleDelete. +// +// It is safe to modify the contents of the arguments after SingleDelete returns. 
+func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error { + deferredOp := b.SingleDeleteDeferred(len(key)) + copy(deferredOp.Key, key) + // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining + // in go1.13 will remove the need for this. + if b.index != nil { + if err := b.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +// SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete +// operation to the batch, except it only takes in key/value lengths instead of +// complete slices, letting the caller encode into those objects and then call +// Finish() on the returned object. +func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp { + b.prepareDeferredKeyRecord(keyLen, InternalKeyKindSingleDelete) + b.deferredOp.index = b.index + return &b.deferredOp +} + +// DeleteRange deletes all of the point keys (and values) in the range +// [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT +// delete overlapping range keys (eg, keys set via RangeKeySet). +// +// It is safe to modify the contents of the arguments after DeleteRange +// returns. +func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error { + deferredOp := b.DeleteRangeDeferred(len(start), len(end)) + copy(deferredOp.Key, start) + copy(deferredOp.Value, end) + // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining + // in go1.13 will remove the need for this. + if deferredOp.index != nil { + if err := deferredOp.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +// DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range +// operation to the batch, except it only takes in key lengths instead of +// complete slices, letting the caller encode into those objects and then call +// Finish() on the returned object. 
Note that DeferredBatchOp.Key should be +// populated with the start key, and DeferredBatchOp.Value should be populated +// with the end key. +func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp { + b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeDelete) + b.countRangeDels++ + if b.index != nil { + b.tombstones = nil + b.tombstonesSeqNum = 0 + // Range deletions are rare, so we lazily allocate the index for them. + if b.rangeDelIndex == nil { + b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) + } + b.deferredOp.index = b.rangeDelIndex + } + return &b.deferredOp +} + +// RangeKeySet sets a range key mapping the key range [start, end) at the MVCC +// timestamp suffix to value. The suffix is optional. If any portion of the key +// range [start, end) is already set by a range key with the same suffix value, +// RangeKeySet overrides it. +// +// It is safe to modify the contents of the arguments after RangeKeySet returns. +func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error { + suffixValues := [1]rangekey.SuffixValue{{Suffix: suffix, Value: value}} + internalValueLen := rangekey.EncodedSetValueLen(end, suffixValues[:]) + + deferredOp := b.rangeKeySetDeferred(len(start), internalValueLen) + copy(deferredOp.Key, start) + n := rangekey.EncodeSetValue(deferredOp.Value, end, suffixValues[:]) + if n != internalValueLen { + panic("unexpected internal value length mismatch") + } + + // Manually inline DeferredBatchOp.Finish(). 
+ if deferredOp.index != nil { + if err := deferredOp.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +func (b *Batch) rangeKeySetDeferred(startLen, internalValueLen int) *DeferredBatchOp { + b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeySet) + b.incrementRangeKeysCount() + return &b.deferredOp +} + +func (b *Batch) incrementRangeKeysCount() { + b.countRangeKeys++ + if b.minimumFormatMajorVersion < FormatRangeKeys { + b.minimumFormatMajorVersion = FormatRangeKeys + } + if b.index != nil { + b.rangeKeys = nil + b.rangeKeysSeqNum = 0 + // Range keys are rare, so we lazily allocate the index for them. + if b.rangeKeyIndex == nil { + b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) + } + b.deferredOp.index = b.rangeKeyIndex + } +} + +// RangeKeyUnset removes a range key mapping the key range [start, end) at the +// MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed +// range key. RangeKeyUnset only removes portions of range keys that fall within +// the [start, end) key span, and only range keys with suffixes that exactly +// match the unset suffix. +// +// It is safe to modify the contents of the arguments after RangeKeyUnset +// returns. 
+func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error { + suffixes := [1][]byte{suffix} + internalValueLen := rangekey.EncodedUnsetValueLen(end, suffixes[:]) + + deferredOp := b.rangeKeyUnsetDeferred(len(start), internalValueLen) + copy(deferredOp.Key, start) + n := rangekey.EncodeUnsetValue(deferredOp.Value, end, suffixes[:]) + if n != internalValueLen { + panic("unexpected internal value length mismatch") + } + + // Manually inline DeferredBatchOp.Finish() + if deferredOp.index != nil { + if err := deferredOp.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +func (b *Batch) rangeKeyUnsetDeferred(startLen, internalValueLen int) *DeferredBatchOp { + b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeyUnset) + b.incrementRangeKeysCount() + return &b.deferredOp +} + +// RangeKeyDelete deletes all of the range keys in the range [start,end) +// (inclusive on start, exclusive on end). It does not delete point keys (for +// that use DeleteRange). RangeKeyDelete removes all range keys within the +// bounds, including those with or without suffixes. +// +// It is safe to modify the contents of the arguments after RangeKeyDelete +// returns. +func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error { + deferredOp := b.RangeKeyDeleteDeferred(len(start), len(end)) + copy(deferredOp.Key, start) + copy(deferredOp.Value, end) + // Manually inline DeferredBatchOp.Finish(). + if deferredOp.index != nil { + if err := deferredOp.index.Add(deferredOp.offset); err != nil { + return err + } + } + return nil +} + +// RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an +// operation to delete range keys to the batch, except it only takes in key +// lengths instead of complete slices, letting the caller encode into those +// objects and then call Finish() on the returned object. 
Note that +// DeferredBatchOp.Key should be populated with the start key, and +// DeferredBatchOp.Value should be populated with the end key. +func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp { + b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeKeyDelete) + b.incrementRangeKeysCount() + return &b.deferredOp +} + +// LogData adds the specified to the batch. The data will be written to the +// WAL, but not added to memtables or sstables. Log data is never indexed, +// which makes it useful for testing WAL performance. +// +// It is safe to modify the contents of the argument after LogData returns. +func (b *Batch) LogData(data []byte, _ *WriteOptions) error { + origCount, origMemTableSize := b.count, b.memTableSize + b.prepareDeferredKeyRecord(len(data), InternalKeyKindLogData) + copy(b.deferredOp.Key, data) + // Since LogData only writes to the WAL and does not affect the memtable, we + // restore b.count and b.memTableSize to their origin values. Note that + // Batch.count only refers to records that are added to the memtable. + b.count, b.memTableSize = origCount, origMemTableSize + return nil +} + +// IngestSST adds the FileNum for an sstable to the batch. The data will only be +// written to the WAL (not added to memtables or sstables). +func (b *Batch) ingestSST(fileNum base.FileNum) { + if b.Empty() { + b.ingestedSSTBatch = true + } else if !b.ingestedSSTBatch { + // Batch contains other key kinds. + panic("pebble: invalid call to ingestSST") + } + + origMemTableSize := b.memTableSize + var buf [binary.MaxVarintLen64]byte + length := binary.PutUvarint(buf[:], uint64(fileNum)) + b.prepareDeferredKeyRecord(length, InternalKeyKindIngestSST) + copy(b.deferredOp.Key, buf[:length]) + // Since IngestSST writes only to the WAL and does not affect the memtable, + // we restore b.memTableSize to its original value. 
Note that Batch.count + // is not reset because for the InternalKeyKindIngestSST the count is the + // number of sstable paths which have been added to the batch. + b.memTableSize = origMemTableSize + b.minimumFormatMajorVersion = FormatFlushableIngest +} + +// Empty returns true if the batch is empty, and false otherwise. +func (b *Batch) Empty() bool { + return len(b.data) <= batchHeaderLen +} + +// Len returns the current size of the batch in bytes. +func (b *Batch) Len() int { + if len(b.data) <= batchHeaderLen { + return batchHeaderLen + } + return len(b.data) +} + +// Repr returns the underlying batch representation. It is not safe to modify +// the contents. Reset() will not change the contents of the returned value, +// though any other mutation operation may do so. +func (b *Batch) Repr() []byte { + if len(b.data) == 0 { + b.init(batchHeaderLen) + } + binary.LittleEndian.PutUint32(b.countData(), b.Count()) + return b.data +} + +// SetRepr sets the underlying batch representation. The batch takes ownership +// of the supplied slice. It is not safe to modify it afterwards until the +// Batch is no longer in use. +func (b *Batch) SetRepr(data []byte) error { + if len(data) < batchHeaderLen { + return base.CorruptionErrorf("invalid batch") + } + b.data = data + b.count = uint64(binary.LittleEndian.Uint32(b.countData())) + var err error + if b.db != nil { + // Only track memTableSize for batches that will be committed to the DB. + err = b.refreshMemTableSize() + } + return err +} + +// NewIter returns an iterator that is unpositioned (Iterator.Valid() will +// return false). The iterator can be positioned via a call to SeekGE, +// SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators. +// +// The returned Iterator observes all of the Batch's existing mutations, but no +// later mutations. Its view can be refreshed via RefreshBatchSnapshot or +// SetOptions(). 
+func (b *Batch) NewIter(o *IterOptions) (*Iterator, error) { + return b.NewIterWithContext(context.Background(), o) +} + +// NewIterWithContext is like NewIter, and additionally accepts a context for +// tracing. +func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) { + if b.index == nil { + return nil, ErrNotIndexed + } + return b.db.newIter(ctx, b, newIterOpts{}, o), nil +} + +// NewBatchOnlyIter constructs an iterator that only reads the contents of the +// batch, and does not overlay the batch mutations on top of the DB state. +// +// The returned Iterator observes all of the Batch's existing mutations, but +// no later mutations. Its view can be refreshed via RefreshBatchSnapshot or +// SetOptions(). +func (b *Batch) NewBatchOnlyIter(ctx context.Context, o *IterOptions) (*Iterator, error) { + if b.index == nil { + return nil, ErrNotIndexed + } + return b.db.newIter(ctx, b, newIterOpts{batch: batchIterOpts{batchOnly: true}}, o), nil +} + +// newInternalIter creates a new internalIterator that iterates over the +// contents of the batch. +func (b *Batch) newInternalIter(o *IterOptions) *batchIter { + iter := &batchIter{} + b.initInternalIter(o, iter) + return iter +} + +func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter) { + *iter = batchIter{ + cmp: b.cmp, + batch: b, + iter: b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()), + // NB: We explicitly do not propagate the batch snapshot to the point + // key iterator. Filtering point keys within the batch iterator can + // cause pathological behavior where a batch iterator advances + // significantly farther than necessary filtering many batch keys that + // are not visible at the batch sequence number. Instead, the merging + // iterator enforces bounds. + // + // For example, consider an engine that contains the committed keys + // 'bar' and 'bax', with no keys between them. Consider a batch + // containing keys 1,000 keys within the range [a,z]. 
All of the + // batch keys were added to the batch after the iterator was + // constructed, so they are not visible to the iterator. A call to + // SeekGE('bax') would seek the LSM iterators and discover the key + // 'bax'. It would also seek the batch iterator, landing on the key + // 'baz' but discover it that it's not visible. The batch iterator would + // next through the rest of the batch's keys, only to discover there are + // no visible keys greater than or equal to 'bax'. + // + // Filtering these batch points within the merging iterator ensures that + // the batch iterator never needs to iterate beyond 'baz', because it + // already found a smaller, visible key 'bax'. + snapshot: base.InternalKeySeqNumMax, + } +} + +func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { + // Construct an iterator even if rangeDelIndex is nil, because it is allowed + // to refresh later, so we need the container to exist. + iter := new(keyspan.Iter) + b.initRangeDelIter(o, iter, batchSnapshot) + return iter +} + +func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { + if b.rangeDelIndex == nil { + iter.Init(b.cmp, nil) + return + } + + // Fragment the range tombstones the first time a range deletion iterator is + // requested. The cached tombstones are invalidated if another range + // deletion tombstone is added to the batch. This cache is only guaranteed + // to be correct if we're opening an iterator to read at a batch sequence + // number at least as high as tombstonesSeqNum. The cache is guaranteed to + // include all tombstones up to tombstonesSeqNum, and if any additional + // tombstones were added after that sequence number the cache would've been + // cleared. 
+ nextSeqNum := b.nextSeqNum() + if b.tombstones != nil && b.tombstonesSeqNum <= batchSnapshot { + iter.Init(b.cmp, b.tombstones) + return + } + + tombstones := make([]keyspan.Span, 0, b.countRangeDels) + frag := &keyspan.Fragmenter{ + Cmp: b.cmp, + Format: b.formatKey, + Emit: func(s keyspan.Span) { + tombstones = append(tombstones, s) + }, + } + it := &batchIter{ + cmp: b.cmp, + batch: b, + iter: b.rangeDelIndex.NewIter(nil, nil), + snapshot: batchSnapshot, + } + fragmentRangeDels(frag, it, int(b.countRangeDels)) + iter.Init(b.cmp, tombstones) + + // If we just read all the tombstones in the batch (eg, batchSnapshot was + // set to b.nextSeqNum()), then cache the tombstones so that a subsequent + // call to initRangeDelIter may use them without refragmenting. + if nextSeqNum == batchSnapshot { + b.tombstones = tombstones + b.tombstonesSeqNum = nextSeqNum + } +} + +func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) { + // The memory management here is a bit subtle. The keys and values returned + // by the iterator are slices in Batch.data. Thus the fragmented tombstones + // are slices within Batch.data. If additional entries are added to the + // Batch, Batch.data may be reallocated. The references in the fragmented + // tombstones will remain valid, pointing into the old Batch.data. GC for + // the win. + + // Use a single []keyspan.Key buffer to avoid allocating many + // individual []keyspan.Key slices with a single element each. + keyBuf := make([]keyspan.Key, 0, count) + for key, val := it.First(); key != nil; key, val = it.Next() { + s := rangedel.Decode(*key, val.InPlaceValue(), keyBuf) + keyBuf = s.Keys[len(s.Keys):] + + // Set a fixed capacity to avoid accidental overwriting. 
+ s.Keys = s.Keys[:len(s.Keys):len(s.Keys)] + frag.Add(s) + } + frag.Finish() +} + +func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { + // Construct an iterator even if rangeKeyIndex is nil, because it is allowed + // to refresh later, so we need the container to exist. + iter := new(keyspan.Iter) + b.initRangeKeyIter(o, iter, batchSnapshot) + return iter +} + +func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { + if b.rangeKeyIndex == nil { + iter.Init(b.cmp, nil) + return + } + + // Fragment the range keys the first time a range key iterator is requested. + // The cached spans are invalidated if another range key is added to the + // batch. This cache is only guaranteed to be correct if we're opening an + // iterator to read at a batch sequence number at least as high as + // rangeKeysSeqNum. The cache is guaranteed to include all range keys up to + // rangeKeysSeqNum, and if any additional range keys were added after that + // sequence number the cache would've been cleared. + nextSeqNum := b.nextSeqNum() + if b.rangeKeys != nil && b.rangeKeysSeqNum <= batchSnapshot { + iter.Init(b.cmp, b.rangeKeys) + return + } + + rangeKeys := make([]keyspan.Span, 0, b.countRangeKeys) + frag := &keyspan.Fragmenter{ + Cmp: b.cmp, + Format: b.formatKey, + Emit: func(s keyspan.Span) { + rangeKeys = append(rangeKeys, s) + }, + } + it := &batchIter{ + cmp: b.cmp, + batch: b, + iter: b.rangeKeyIndex.NewIter(nil, nil), + snapshot: batchSnapshot, + } + fragmentRangeKeys(frag, it, int(b.countRangeKeys)) + iter.Init(b.cmp, rangeKeys) + + // If we just read all the range keys in the batch (eg, batchSnapshot was + // set to b.nextSeqNum()), then cache the range keys so that a subsequent + // call to initRangeKeyIter may use them without refragmenting. 
+ if nextSeqNum == batchSnapshot { + b.rangeKeys = rangeKeys + b.rangeKeysSeqNum = nextSeqNum + } +} + +func fragmentRangeKeys(frag *keyspan.Fragmenter, it internalIterator, count int) error { + // The memory management here is a bit subtle. The keys and values + // returned by the iterator are slices in Batch.data. Thus the + // fragmented key spans are slices within Batch.data. If additional + // entries are added to the Batch, Batch.data may be reallocated. The + // references in the fragmented keys will remain valid, pointing into + // the old Batch.data. GC for the win. + + // Use a single []keyspan.Key buffer to avoid allocating many + // individual []keyspan.Key slices with a single element each. + keyBuf := make([]keyspan.Key, 0, count) + for ik, val := it.First(); ik != nil; ik, val = it.Next() { + s, err := rangekey.Decode(*ik, val.InPlaceValue(), keyBuf) + if err != nil { + return err + } + keyBuf = s.Keys[len(s.Keys):] + + // Set a fixed capacity to avoid accidental overwriting. + s.Keys = s.Keys[:len(s.Keys):len(s.Keys)] + frag.Add(s) + } + frag.Finish() + return nil +} + +// Commit applies the batch to its parent writer. +func (b *Batch) Commit(o *WriteOptions) error { + return b.db.Apply(b, o) +} + +// Close closes the batch without committing it. +func (b *Batch) Close() error { + b.release() + return nil +} + +// Indexed returns true if the batch is indexed (i.e. supports read +// operations). +func (b *Batch) Indexed() bool { + return b.index != nil +} + +// init ensures that the batch data slice is initialized to meet the +// minimum required size and allocates space for the batch header. +func (b *Batch) init(size int) { + n := batchInitialSize + for n < size { + n *= 2 + } + if cap(b.data) < n { + b.data = rawalloc.New(batchHeaderLen, n) + } + b.setCount(0) + b.setSeqNum(0) + b.data = b.data[:batchHeaderLen] +} + +// Reset resets the batch for reuse. The underlying byte slice (that is +// returned by Repr()) may not be modified. 
It is only necessary to call this +// method if a batch is explicitly being reused. Close automatically takes are +// of releasing resources when appropriate for batches that are internally +// being reused. +func (b *Batch) Reset() { + // Zero out the struct, retaining only the fields necessary for manual + // reuse. + b.batchInternal = batchInternal{ + data: b.data, + cmp: b.cmp, + formatKey: b.formatKey, + abbreviatedKey: b.abbreviatedKey, + index: b.index, + db: b.db, + } + b.applied.Store(false) + if b.data != nil { + if cap(b.data) > batchMaxRetainedSize { + // If the capacity of the buffer is larger than our maximum + // retention size, don't re-use it. Let it be GC-ed instead. + // This prevents the memory from an unusually large batch from + // being held on to indefinitely. + b.data = nil + } else { + // Otherwise, reset the buffer for re-use. + b.data = b.data[:batchHeaderLen] + b.setSeqNum(0) + } + } + if b.index != nil { + b.index.Init(&b.data, b.cmp, b.abbreviatedKey) + } +} + +// seqNumData returns the 8 byte little-endian sequence number. Zero means that +// the batch has not yet been applied. +func (b *Batch) seqNumData() []byte { + return b.data[:8] +} + +// countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff" +// means that the batch is invalid. +func (b *Batch) countData() []byte { + return b.data[8:12] +} + +func (b *Batch) grow(n int) { + newSize := len(b.data) + n + if uint64(newSize) >= maxBatchSize { + panic(ErrBatchTooLarge) + } + if newSize > cap(b.data) { + newCap := 2 * cap(b.data) + for newCap < newSize { + newCap *= 2 + } + newData := rawalloc.New(len(b.data), newCap) + copy(newData, b.data) + b.data = newData + } + b.data = b.data[:newSize] +} + +func (b *Batch) setSeqNum(seqNum uint64) { + binary.LittleEndian.PutUint64(b.seqNumData(), seqNum) +} + +// SeqNum returns the batch sequence number which is applied to the first +// record in the batch. The sequence number is incremented for each subsequent +// record. 
It returns zero if the batch is empty. +func (b *Batch) SeqNum() uint64 { + if len(b.data) == 0 { + b.init(batchHeaderLen) + } + return binary.LittleEndian.Uint64(b.seqNumData()) +} + +func (b *Batch) setCount(v uint32) { + b.count = uint64(v) +} + +// Count returns the count of memtable-modifying operations in this batch. All +// operations with the except of LogData increment this count. For IngestSSTs, +// count is only used to indicate the number of SSTs ingested in the record, the +// batch isn't applied to the memtable. +func (b *Batch) Count() uint32 { + if b.count > math.MaxUint32 { + panic(ErrInvalidBatch) + } + return uint32(b.count) +} + +// Reader returns a BatchReader for the current batch contents. If the batch is +// mutated, the new entries will not be visible to the reader. +func (b *Batch) Reader() BatchReader { + if len(b.data) == 0 { + b.init(batchHeaderLen) + } + return b.data[batchHeaderLen:] +} + +func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) { + // TODO(jackson): This will index out of bounds if there's no varint or an + // invalid varint (eg, a single 0xff byte). Correcting will add a bit of + // overhead. We could avoid that overhead whenever len(data) >= + // binary.MaxVarint32? 
+ + var v uint32 + var n int + ptr := unsafe.Pointer(&data[0]) + if a := *((*uint8)(ptr)); a < 128 { + v = uint32(a) + n = 1 + } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { + v = uint32(b)<<7 | uint32(a) + n = 2 + } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { + v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) + n = 3 + } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { + v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + n = 4 + } else { + d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) + v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + n = 5 + } + + data = data[n:] + if v > uint32(len(data)) { + return nil, nil, false + } + return data[v:], data[:v], true +} + +// SyncWait is to be used in conjunction with DB.ApplyNoSyncWait. +func (b *Batch) SyncWait() error { + now := time.Now() + b.fsyncWait.Wait() + if b.commitErr != nil { + b.db = nil // prevent batch reuse on error + } + waitDuration := time.Since(now) + b.commitStats.CommitWaitDuration += waitDuration + b.commitStats.TotalDuration += waitDuration + return b.commitErr +} + +// CommitStats returns stats related to committing the batch. Should be called +// after Batch.Commit, DB.Apply. If DB.ApplyNoSyncWait is used, should be +// called after Batch.SyncWait. +func (b *Batch) CommitStats() BatchCommitStats { + return b.commitStats +} + +// BatchReader iterates over the entries contained in a batch. +type BatchReader []byte + +// ReadBatch constructs a BatchReader from a batch representation. The +// header is not validated. ReadBatch returns a new batch reader and the +// count of entries contained within the batch. 
+func ReadBatch(repr []byte) (r BatchReader, count uint32) { + if len(repr) <= batchHeaderLen { + return nil, count + } + count = binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen]) + return repr[batchHeaderLen:], count +} + +// Next returns the next entry in this batch, if there is one. If the reader has +// reached the end of the batch, Next returns ok=false and a nil error. If the +// batch is corrupt and the next entry is illegible, Next returns ok=false and a +// non-nil error. +func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool, err error) { + if len(*r) == 0 { + return 0, nil, nil, false, nil + } + kind = InternalKeyKind((*r)[0]) + if kind > InternalKeyKindMax { + return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0]) + } + *r, ukey, ok = batchDecodeStr((*r)[1:]) + if !ok { + return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding user key") + } + switch kind { + case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, + InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, + InternalKeyKindDeleteSized: + *r, value, ok = batchDecodeStr(*r) + if !ok { + return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding %s value", kind) + } + } + return kind, ukey, value, true, nil +} + +// Note: batchIter mirrors the implementation of flushableBatchIter. Keep the +// two in sync. +type batchIter struct { + cmp Compare + batch *Batch + iter batchskl.Iterator + err error + // snapshot holds a batch "sequence number" at which the batch is being + // read. This sequence number has the InternalKeySeqNumBatch bit set, so it + // encodes an offset within the batch. Only batch entries earlier than the + // offset are visible during iteration. + snapshot uint64 +} + +// batchIter implements the base.InternalIterator interface. 
var _ base.InternalIterator = (*batchIter)(nil)

// String implements fmt.Stringer.
func (i *batchIter) String() string {
	return "batch"
}

// SeekGE positions the iterator at the first key >= the given key. Entries
// with sequence numbers at or beyond i.snapshot are skipped over, so only the
// visible portion of the batch is surfaced.
func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
	// Ignore TrySeekUsingNext if the view of the batch changed.
	if flags.TrySeekUsingNext() && flags.BatchJustRefreshed() {
		flags = flags.DisableTrySeekUsingNext()
	}

	i.err = nil // clear cached iteration error
	ikey := i.iter.SeekGE(key, flags)
	for ikey != nil && ikey.SeqNum() >= i.snapshot {
		ikey = i.iter.Next()
	}
	if ikey == nil {
		return nil, base.LazyValue{}
	}
	return ikey, base.MakeInPlaceValue(i.value())
}

// SeekPrefixGE delegates to SeekGE; the prefix argument is not used by the
// batch iterator.
func (i *batchIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	return i.SeekGE(key, flags)
}

// SeekLT positions the iterator at the last key < the given key, skipping
// entries not visible at i.snapshot.
func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	ikey := i.iter.SeekLT(key)
	for ikey != nil && ikey.SeqNum() >= i.snapshot {
		ikey = i.iter.Prev()
	}
	if ikey == nil {
		return nil, base.LazyValue{}
	}
	return ikey, base.MakeInPlaceValue(i.value())
}

// First positions the iterator at the first visible entry.
func (i *batchIter) First() (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	ikey := i.iter.First()
	for ikey != nil && ikey.SeqNum() >= i.snapshot {
		ikey = i.iter.Next()
	}
	if ikey == nil {
		return nil, base.LazyValue{}
	}
	return ikey, base.MakeInPlaceValue(i.value())
}

// Last positions the iterator at the last visible entry.
func (i *batchIter) Last() (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	ikey := i.iter.Last()
	for ikey != nil && ikey.SeqNum() >= i.snapshot {
		ikey = i.iter.Prev()
	}
	if ikey == nil {
		return nil, base.LazyValue{}
	}
	return ikey, base.MakeInPlaceValue(i.value())
}

// Next advances to the next visible entry. Note that, unlike the absolute
// positioning methods above, Next does not clear a cached iteration error.
func (i *batchIter) Next() (*InternalKey, base.LazyValue) {
	ikey := i.iter.Next()
	for ikey != nil && ikey.SeqNum() >= i.snapshot {
		ikey = i.iter.Next()
	}
	if ikey == nil {
		return nil, base.LazyValue{}
	}
	return ikey, base.MakeInPlaceValue(i.value())
}

func (i *batchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) {
	// Because NextPrefix was invoked `succKey` must be ≥ the key at i's current
	// position. Seek the arena iterator using TrySeekUsingNext.
	ikey := i.iter.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext())
	for ikey != nil && ikey.SeqNum() >= i.snapshot {
		ikey = i.iter.Next()
	}
	if ikey == nil {
		return nil, base.LazyValue{}
	}
	return ikey, base.MakeInPlaceValue(i.value())
}

// Prev steps back to the previous visible entry.
func (i *batchIter) Prev() (*InternalKey, base.LazyValue) {
	ikey := i.iter.Prev()
	for ikey != nil && ikey.SeqNum() >= i.snapshot {
		ikey = i.iter.Prev()
	}
	if ikey == nil {
		return nil, base.LazyValue{}
	}
	return ikey, base.MakeInPlaceValue(i.value())
}

// value decodes the value for the entry at the iterator's current position
// directly from the batch repr. On corruption it records the error in i.err
// and returns nil. Kinds that carry no value also return nil.
func (i *batchIter) value() []byte {
	offset, _, keyEnd := i.iter.KeyInfo()
	data := i.batch.data
	if len(data[offset:]) == 0 {
		i.err = base.CorruptionErrorf("corrupted batch")
		return nil
	}

	switch InternalKeyKind(data[offset]) {
	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete,
		InternalKeyKindDeleteSized:
		_, value, ok := batchDecodeStr(data[keyEnd:])
		if !ok {
			return nil
		}
		return value
	default:
		return nil
	}
}

// Error returns any cached iteration error.
func (i *batchIter) Error() error {
	return i.err
}

// Close closes the underlying skiplist iterator and returns any cached
// iteration error.
func (i *batchIter) Close() error {
	_ = i.iter.Close()
	return i.err
}

func (i *batchIter) SetBounds(lower, upper []byte) {
	i.iter.SetBounds(lower, upper)
}

func (i *batchIter) SetContext(_ context.Context) {}

type flushableBatchEntry struct {
	// offset is the byte offset of the record within the batch repr.
	offset uint32
	// index is the 0-based ordinal number of the record within the batch. Used
	// to compute the seqnum for the record.
	index uint32
	// key{Start,End} are the start and end byte offsets of the key within the
	// batch repr. Cached to avoid decoding the key length on every
	// comparison. The value is stored starting at keyEnd.
	keyStart uint32
	keyEnd   uint32
}

// flushableBatch wraps an existing batch and provides the interfaces needed
// for making the batch flushable (i.e. able to mimic a memtable).
type flushableBatch struct {
	cmp       Compare
	formatKey base.FormatKey
	data      []byte

	// The base sequence number for the entries in the batch. This is the same
	// value as Batch.seqNum() and is cached here for performance.
	seqNum uint64

	// A slice of offsets and indices for the entries in the batch. Used to
	// implement flushableBatchIter. Unlike the indexing on a normal batch, a
	// flushable batch is indexed such that batch entry i will be given the
	// sequence number flushableBatch.seqNum+i.
	//
	// Sorted in increasing order of key and decreasing order of offset (since
	// higher offsets correspond to higher sequence numbers).
	//
	// Does not include range deletion entries or range key entries.
	offsets []flushableBatchEntry

	// Fragmented range deletion tombstones.
	tombstones []keyspan.Span

	// Fragmented range keys.
	rangeKeys []keyspan.Span
}

var _ flushable = (*flushableBatch)(nil)

// newFlushableBatch creates a new batch that implements the flushable
// interface. This allows the batch to act like a memtable and be placed in the
// queue of flushable memtables. Note that the flushable batch takes ownership
// of the batch data.
func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error) {
	b := &flushableBatch{
		data:      batch.data,
		cmp:       comparer.Compare,
		formatKey: comparer.FormatKey,
		offsets:   make([]flushableBatchEntry, 0, batch.Count()),
	}
	if b.data != nil {
		// Note that this sequence number is not correct when this batch has not
		// been applied since the sequence number has not been assigned yet. The
		// correct sequence number will be set later. But it is correct when the
		// batch is being replayed from the WAL.
		b.seqNum = batch.SeqNum()
	}
	var rangeDelOffsets []flushableBatchEntry
	var rangeKeyOffsets []flushableBatchEntry
	if len(b.data) > batchHeaderLen {
		// Non-empty batch.
		var index uint32
		for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ {
			// Compute the entry's byte offset within the repr via pointer
			// arithmetic: the reader slice aliases b.data.
			offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
			kind, key, _, ok, err := iter.Next()
			if !ok {
				if err != nil {
					return nil, err
				}
				break
			}
			entry := flushableBatchEntry{
				offset: uint32(offset),
				index:  uint32(index),
			}
			if keySize := uint32(len(key)); keySize == 0 {
				// Must add 2 to the offset. One byte encodes `kind` and the next
				// byte encodes `0`, which is the length of the key.
				entry.keyStart = uint32(offset) + 2
				entry.keyEnd = entry.keyStart
			} else {
				entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) -
					uintptr(unsafe.Pointer(&b.data[0])))
				entry.keyEnd = entry.keyStart + keySize
			}
			switch kind {
			case InternalKeyKindRangeDelete:
				rangeDelOffsets = append(rangeDelOffsets, entry)
			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
				rangeKeyOffsets = append(rangeKeyOffsets, entry)
			default:
				b.offsets = append(b.offsets, entry)
			}
		}
	}

	// Sort all of offsets, rangeDelOffsets and rangeKeyOffsets, using *batch's
	// sort.Interface implementation.
	pointOffsets := b.offsets
	sort.Sort(b)
	b.offsets = rangeDelOffsets
	sort.Sort(b)
	b.offsets = rangeKeyOffsets
	sort.Sort(b)
	b.offsets = pointOffsets

	if len(rangeDelOffsets) > 0 {
		frag := &keyspan.Fragmenter{
			Cmp:    b.cmp,
			Format: b.formatKey,
			Emit: func(s keyspan.Span) {
				b.tombstones = append(b.tombstones, s)
			},
		}
		it := &flushableBatchIter{
			batch:   b,
			data:    b.data,
			offsets: rangeDelOffsets,
			cmp:     b.cmp,
			index:   -1,
		}
		fragmentRangeDels(frag, it, len(rangeDelOffsets))
	}
	if len(rangeKeyOffsets) > 0 {
		frag := &keyspan.Fragmenter{
			Cmp:    b.cmp,
			Format: b.formatKey,
			Emit: func(s keyspan.Span) {
				b.rangeKeys = append(b.rangeKeys, s)
			},
		}
		it := &flushableBatchIter{
			batch:   b,
			data:    b.data,
			offsets: rangeKeyOffsets,
			cmp:     b.cmp,
			index:   -1,
		}
		fragmentRangeKeys(frag, it, len(rangeKeyOffsets))
	}
	return b, nil
}

// setSeqNum records the base sequence number for the batch's entries and
// rebases the trailers of all fragmented tombstones and range keys by that
// amount. Panics if a sequence number has already been set.
func (b *flushableBatch) setSeqNum(seqNum uint64) {
	if b.seqNum != 0 {
		panic(fmt.Sprintf("pebble: flushableBatch.seqNum already set: %d", b.seqNum))
	}
	b.seqNum = seqNum
	for i := range b.tombstones {
		for j := range b.tombstones[i].Keys {
			b.tombstones[i].Keys[j].Trailer = base.MakeTrailer(
				b.tombstones[i].Keys[j].SeqNum()+seqNum,
				b.tombstones[i].Keys[j].Kind(),
			)
		}
	}
	for i := range b.rangeKeys {
		for j := range b.rangeKeys[i].Keys {
			b.rangeKeys[i].Keys[j].Trailer = base.MakeTrailer(
				b.rangeKeys[i].Keys[j].SeqNum()+seqNum,
				b.rangeKeys[i].Keys[j].Kind(),
			)
		}
	}
}

// Len implements sort.Interface over b.offsets.
func (b *flushableBatch) Len() int {
	return len(b.offsets)
}

// Less implements sort.Interface: entries order by increasing key, and ties
// break by decreasing offset (higher offsets correspond to higher seqnums).
func (b *flushableBatch) Less(i, j int) bool {
	ei := &b.offsets[i]
	ej := &b.offsets[j]
	ki := b.data[ei.keyStart:ei.keyEnd]
	kj := b.data[ej.keyStart:ej.keyEnd]
	switch c := b.cmp(ki, kj); {
	case c < 0:
		return true
	case c > 0:
		return false
	default:
		return ei.offset > ej.offset
	}
}

// Swap implements sort.Interface.
func (b *flushableBatch) Swap(i, j int) {
	b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i]
}

// newIter is part of the flushable interface.
func (b *flushableBatch) newIter(o *IterOptions) internalIterator {
	return &flushableBatchIter{
		batch:   b,
		data:    b.data,
		offsets: b.offsets,
		cmp:     b.cmp,
		index:   -1,
		lower:   o.GetLowerBound(),
		upper:   o.GetUpperBound(),
	}
}

// newFlushIter is part of the flushable interface.
func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
	return &flushFlushableBatchIter{
		flushableBatchIter: flushableBatchIter{
			batch:   b,
			data:    b.data,
			offsets: b.offsets,
			cmp:     b.cmp,
			index:   -1,
		},
		bytesIterated: bytesFlushed,
	}
}

// newRangeDelIter is part of the flushable interface.
func (b *flushableBatch) newRangeDelIter(o *IterOptions) keyspan.FragmentIterator {
	if len(b.tombstones) == 0 {
		return nil
	}
	return keyspan.NewIter(b.cmp, b.tombstones)
}

// newRangeKeyIter is part of the flushable interface.
func (b *flushableBatch) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator {
	if len(b.rangeKeys) == 0 {
		return nil
	}
	return keyspan.NewIter(b.cmp, b.rangeKeys)
}

// containsRangeKeys is part of the flushable interface.
func (b *flushableBatch) containsRangeKeys() bool { return len(b.rangeKeys) > 0 }

// inuseBytes is part of the flushable interface.
func (b *flushableBatch) inuseBytes() uint64 {
	return uint64(len(b.data) - batchHeaderLen)
}

// totalBytes is part of the flushable interface.
func (b *flushableBatch) totalBytes() uint64 {
	return uint64(cap(b.data))
}

// readyForFlush is part of the flushable interface.
func (b *flushableBatch) readyForFlush() bool {
	// A flushable batch is always ready for flush; it must be flushed together
	// with the previous memtable.
	return true
}

// Note: flushableBatchIter mirrors the implementation of batchIter. Keep the
// two in sync.
type flushableBatchIter struct {
	// Members to be initialized by creator.
	batch *flushableBatch
	// The bytes backing the batch. Always the same as batch.data?
	data []byte
	// The sorted entries. This is not always equal to batch.offsets.
	offsets []flushableBatchEntry
	cmp     Compare
	// Must be initialized to -1. It is the index into offsets that represents
	// the current iterator position.
	index int

	// For internal use by the implementation.
	key InternalKey
	err error

	// Optionally initialize to bounds of iteration, if any.
	lower []byte
	upper []byte
}

// flushableBatchIter implements the base.InternalIterator interface.
var _ base.InternalIterator = (*flushableBatchIter)(nil)

// String implements fmt.Stringer.
func (i *flushableBatchIter) String() string {
	return "flushable-batch"
}

// SeekGE implements internalIterator.SeekGE, as documented in the pebble
// package. Ignore flags.TrySeekUsingNext() since we don't expect this
// optimization to provide much benefit here at the moment.
func (i *flushableBatchIter) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	ikey := base.MakeSearchKey(key)
	// Binary-search i.offsets (sorted by key) for the first entry >= ikey.
	i.index = sort.Search(len(i.offsets), func(j int) bool {
		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
	})
	if i.index >= len(i.offsets) {
		return nil, base.LazyValue{}
	}
	i.key = i.getKey(i.index)
	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
		i.index = len(i.offsets)
		return nil, base.LazyValue{}
	}
	return &i.key, i.value()
}

// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
// pebble package.
func (i *flushableBatchIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	return i.SeekGE(key, flags)
}

// SeekLT implements internalIterator.SeekLT, as documented in the pebble
// package.
func (i *flushableBatchIter) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	ikey := base.MakeSearchKey(key)
	// Find the first entry >= ikey, then step back one to land on the last
	// entry strictly less than the search key.
	i.index = sort.Search(len(i.offsets), func(j int) bool {
		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
	})
	i.index--
	if i.index < 0 {
		return nil, base.LazyValue{}
	}
	i.key = i.getKey(i.index)
	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
		i.index = -1
		return nil, base.LazyValue{}
	}
	return &i.key, i.value()
}

// First implements internalIterator.First, as documented in the pebble
// package.
func (i *flushableBatchIter) First() (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	if len(i.offsets) == 0 {
		return nil, base.LazyValue{}
	}
	i.index = 0
	i.key = i.getKey(i.index)
	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
		i.index = len(i.offsets)
		return nil, base.LazyValue{}
	}
	return &i.key, i.value()
}

// Last implements internalIterator.Last, as documented in the pebble
// package.
func (i *flushableBatchIter) Last() (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	if len(i.offsets) == 0 {
		return nil, base.LazyValue{}
	}
	i.index = len(i.offsets) - 1
	i.key = i.getKey(i.index)
	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
		i.index = -1
		return nil, base.LazyValue{}
	}
	return &i.key, i.value()
}

// Note: flushFlushableBatchIter.Next mirrors the implementation of
// flushableBatchIter.Next due to performance. Keep the two in sync.
+func (i *flushableBatchIter) Next() (*InternalKey, base.LazyValue) { + if i.index == len(i.offsets) { + return nil, base.LazyValue{} + } + i.index++ + if i.index == len(i.offsets) { + return nil, base.LazyValue{} + } + i.key = i.getKey(i.index) + if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { + i.index = len(i.offsets) + return nil, base.LazyValue{} + } + return &i.key, i.value() +} + +func (i *flushableBatchIter) Prev() (*InternalKey, base.LazyValue) { + if i.index < 0 { + return nil, base.LazyValue{} + } + i.index-- + if i.index < 0 { + return nil, base.LazyValue{} + } + i.key = i.getKey(i.index) + if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { + i.index = -1 + return nil, base.LazyValue{} + } + return &i.key, i.value() +} + +// Note: flushFlushableBatchIter.NextPrefix mirrors the implementation of +// flushableBatchIter.NextPrefix due to performance. Keep the two in sync. +func (i *flushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { + return i.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) +} + +func (i *flushableBatchIter) getKey(index int) InternalKey { + e := &i.offsets[index] + kind := InternalKeyKind(i.data[e.offset]) + key := i.data[e.keyStart:e.keyEnd] + return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind) +} + +func (i *flushableBatchIter) value() base.LazyValue { + p := i.data[i.offsets[i.index].offset:] + if len(p) == 0 { + i.err = base.CorruptionErrorf("corrupted batch") + return base.LazyValue{} + } + kind := InternalKeyKind(p[0]) + if kind > InternalKeyKindMax { + i.err = base.CorruptionErrorf("corrupted batch") + return base.LazyValue{} + } + var value []byte + var ok bool + switch kind { + case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, + InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, + InternalKeyKindDeleteSized: + keyEnd := i.offsets[i.index].keyEnd + _, value, ok = batchDecodeStr(i.data[keyEnd:]) + 
if !ok { + i.err = base.CorruptionErrorf("corrupted batch") + return base.LazyValue{} + } + } + return base.MakeInPlaceValue(value) +} + +func (i *flushableBatchIter) Valid() bool { + return i.index >= 0 && i.index < len(i.offsets) +} + +func (i *flushableBatchIter) Error() error { + return i.err +} + +func (i *flushableBatchIter) Close() error { + return i.err +} + +func (i *flushableBatchIter) SetBounds(lower, upper []byte) { + i.lower = lower + i.upper = upper +} + +func (i *flushableBatchIter) SetContext(_ context.Context) {} + +// flushFlushableBatchIter is similar to flushableBatchIter but it keeps track +// of number of bytes iterated. +type flushFlushableBatchIter struct { + flushableBatchIter + bytesIterated *uint64 +} + +// flushFlushableBatchIter implements the base.InternalIterator interface. +var _ base.InternalIterator = (*flushFlushableBatchIter)(nil) + +func (i *flushFlushableBatchIter) String() string { + return "flushable-batch" +} + +func (i *flushFlushableBatchIter) SeekGE( + key []byte, flags base.SeekGEFlags, +) (*InternalKey, base.LazyValue) { + panic("pebble: SeekGE unimplemented") +} + +func (i *flushFlushableBatchIter) SeekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + panic("pebble: SeekPrefixGE unimplemented") +} + +func (i *flushFlushableBatchIter) SeekLT( + key []byte, flags base.SeekLTFlags, +) (*InternalKey, base.LazyValue) { + panic("pebble: SeekLT unimplemented") +} + +func (i *flushFlushableBatchIter) First() (*InternalKey, base.LazyValue) { + i.err = nil // clear cached iteration error + key, val := i.flushableBatchIter.First() + if key == nil { + return nil, base.LazyValue{} + } + entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset + *i.bytesIterated += uint64(entryBytes) + i.valueSize() + return key, val +} + +func (i *flushFlushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { + panic("pebble: Prev unimplemented") +} + +// Note: 
flushFlushableBatchIter.Next mirrors the implementation of +// flushableBatchIter.Next due to performance. Keep the two in sync. +func (i *flushFlushableBatchIter) Next() (*InternalKey, base.LazyValue) { + if i.index == len(i.offsets) { + return nil, base.LazyValue{} + } + i.index++ + if i.index == len(i.offsets) { + return nil, base.LazyValue{} + } + i.key = i.getKey(i.index) + entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset + *i.bytesIterated += uint64(entryBytes) + i.valueSize() + return &i.key, i.value() +} + +func (i flushFlushableBatchIter) Prev() (*InternalKey, base.LazyValue) { + panic("pebble: Prev unimplemented") +} + +func (i flushFlushableBatchIter) valueSize() uint64 { + p := i.data[i.offsets[i.index].offset:] + if len(p) == 0 { + i.err = base.CorruptionErrorf("corrupted batch") + return 0 + } + kind := InternalKeyKind(p[0]) + if kind > InternalKeyKindMax { + i.err = base.CorruptionErrorf("corrupted batch") + return 0 + } + var length uint64 + switch kind { + case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete: + keyEnd := i.offsets[i.index].keyEnd + v, n := binary.Uvarint(i.data[keyEnd:]) + if n <= 0 { + i.err = base.CorruptionErrorf("corrupted batch") + return 0 + } + length = v + uint64(n) + } + return length +} + +// batchSort returns iterators for the sorted contents of the batch. It is +// intended for testing use only. The batch.Sort dance is done to prevent +// exposing this method in the public pebble interface. 
+func batchSort( + i interface{}, +) ( + points internalIterator, + rangeDels keyspan.FragmentIterator, + rangeKeys keyspan.FragmentIterator, +) { + b := i.(*Batch) + if b.Indexed() { + pointIter := b.newInternalIter(nil) + rangeDelIter := b.newRangeDelIter(nil, math.MaxUint64) + rangeKeyIter := b.newRangeKeyIter(nil, math.MaxUint64) + return pointIter, rangeDelIter, rangeKeyIter + } + f, err := newFlushableBatch(b, b.db.opts.Comparer) + if err != nil { + panic(err) + } + return f.newIter(nil), f.newRangeDelIter(nil), f.newRangeKeyIter(nil) +} + +func init() { + private.BatchSort = batchSort +} diff --git a/pebble/batch_test.go b/pebble/batch_test.go new file mode 100644 index 0000000..c977874 --- /dev/null +++ b/pebble/batch_test.go @@ -0,0 +1,1652 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "context" + "encoding/binary" + "encoding/hex" + "fmt" + "io" + "math" + "math/rand" + "strconv" + "strings" + "sync" + "testing" + "time" + "unicode" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/batchskl" + "github.com/cockroachdb/pebble/internal/itertest" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +func TestBatch(t *testing.T) { + testBatch(t, 0) + testBatch(t, batchInitialSize) +} + +func testBatch(t *testing.T, size int) { + type testCase struct { + kind InternalKeyKind + key, value string + valueInt uint32 + } + + verifyTestCases := func(b *Batch, testCases []testCase, indexedPointKindsOnly bool) { + r := b.Reader() + + for _, tc := range testCases { + if indexedPointKindsOnly && (tc.kind == InternalKeyKindLogData || tc.kind == 
InternalKeyKindIngestSST || + tc.kind == InternalKeyKindRangeDelete) { + continue + } + kind, k, v, ok, err := r.Next() + if !ok { + if err != nil { + t.Fatal(err) + } + t.Fatalf("next returned !ok: test case = %v", tc) + } + key, value := string(k), string(v) + if kind != tc.kind || key != tc.key || value != tc.value { + t.Errorf("got (%d, %q, %q), want (%d, %q, %q)", + kind, key, value, tc.kind, tc.key, tc.value) + } + } + if len(r) != 0 { + t.Errorf("reader was not exhausted: remaining bytes = %q", r) + } + } + + encodeFileNum := func(n base.FileNum) string { + return string(binary.AppendUvarint(nil, uint64(n))) + } + decodeFileNum := func(d []byte) base.FileNum { + val, n := binary.Uvarint(d) + if n <= 0 { + t.Fatalf("invalid filenum encoding") + } + return base.FileNum(val) + } + + // RangeKeySet and RangeKeyUnset are untested here because they don't expose + // deferred variants. This is a consequence of these keys' more complex + // value encodings. + testCases := []testCase{ + {InternalKeyKindIngestSST, encodeFileNum(1), "", 0}, + {InternalKeyKindSet, "roses", "red", 0}, + {InternalKeyKindSet, "violets", "blue", 0}, + {InternalKeyKindDelete, "roses", "", 0}, + {InternalKeyKindSingleDelete, "roses", "", 0}, + {InternalKeyKindSet, "", "", 0}, + {InternalKeyKindSet, "", "non-empty", 0}, + {InternalKeyKindDelete, "", "", 0}, + {InternalKeyKindSingleDelete, "", "", 0}, + {InternalKeyKindSet, "grass", "green", 0}, + {InternalKeyKindSet, "grass", "greener", 0}, + {InternalKeyKindSet, "eleventy", strings.Repeat("!!11!", 100), 0}, + {InternalKeyKindDelete, "nosuchkey", "", 0}, + {InternalKeyKindDeleteSized, "eleventy", string(binary.AppendUvarint([]byte(nil), 508)), 500}, + {InternalKeyKindSingleDelete, "nosuchkey", "", 0}, + {InternalKeyKindSet, "binarydata", "\x00", 0}, + {InternalKeyKindSet, "binarydata", "\xff", 0}, + {InternalKeyKindMerge, "merge", "mergedata", 0}, + {InternalKeyKindMerge, "merge", "", 0}, + {InternalKeyKindMerge, "", "", 0}, + 
{InternalKeyKindRangeDelete, "a", "b", 0}, + {InternalKeyKindRangeDelete, "", "", 0}, + {InternalKeyKindLogData, "logdata", "", 0}, + {InternalKeyKindLogData, "", "", 0}, + {InternalKeyKindRangeKeyDelete, "grass", "green", 0}, + {InternalKeyKindRangeKeyDelete, "", "", 0}, + {InternalKeyKindDeleteSized, "nosuchkey", string(binary.AppendUvarint([]byte(nil), 11)), 2}, + } + b := newBatchWithSize(nil, size) + for _, tc := range testCases { + switch tc.kind { + case InternalKeyKindSet: + _ = b.Set([]byte(tc.key), []byte(tc.value), nil) + case InternalKeyKindMerge: + _ = b.Merge([]byte(tc.key), []byte(tc.value), nil) + case InternalKeyKindDelete: + _ = b.Delete([]byte(tc.key), nil) + case InternalKeyKindDeleteSized: + _ = b.DeleteSized([]byte(tc.key), tc.valueInt, nil) + case InternalKeyKindSingleDelete: + _ = b.SingleDelete([]byte(tc.key), nil) + case InternalKeyKindRangeDelete: + _ = b.DeleteRange([]byte(tc.key), []byte(tc.value), nil) + case InternalKeyKindLogData: + _ = b.LogData([]byte(tc.key), nil) + case InternalKeyKindRangeKeyDelete: + _ = b.RangeKeyDelete([]byte(tc.key), []byte(tc.value), nil) + case InternalKeyKindIngestSST: + b.ingestSST(decodeFileNum([]byte(tc.key))) + } + } + verifyTestCases(b, testCases, false /* indexedKindsOnly */) + + b.Reset() + // Run the same operations, this time using the Deferred variants of each + // operation (eg. SetDeferred). 
+ for _, tc := range testCases { + key := []byte(tc.key) + value := []byte(tc.value) + switch tc.kind { + case InternalKeyKindSet: + d := b.SetDeferred(len(key), len(value)) + copy(d.Key, key) + copy(d.Value, value) + d.Finish() + case InternalKeyKindMerge: + d := b.MergeDeferred(len(key), len(value)) + copy(d.Key, key) + copy(d.Value, value) + d.Finish() + case InternalKeyKindDelete: + d := b.DeleteDeferred(len(key)) + copy(d.Key, key) + copy(d.Value, value) + d.Finish() + case InternalKeyKindDeleteSized: + d := b.DeleteSizedDeferred(len(tc.key), tc.valueInt) + copy(d.Key, key) + d.Finish() + case InternalKeyKindSingleDelete: + d := b.SingleDeleteDeferred(len(key)) + copy(d.Key, key) + copy(d.Value, value) + d.Finish() + case InternalKeyKindRangeDelete: + d := b.DeleteRangeDeferred(len(key), len(value)) + copy(d.Key, key) + copy(d.Value, value) + d.Finish() + case InternalKeyKindLogData: + _ = b.LogData([]byte(tc.key), nil) + case InternalKeyKindIngestSST: + b.ingestSST(decodeFileNum([]byte(tc.key))) + case InternalKeyKindRangeKeyDelete: + d := b.RangeKeyDeleteDeferred(len(key), len(value)) + copy(d.Key, key) + copy(d.Value, value) + d.Finish() + } + } + verifyTestCases(b, testCases, false /* indexedKindsOnly */) + + b.Reset() + // Run the same operations, this time using AddInternalKey instead of the + // Kind-specific methods. 
+ for _, tc := range testCases { + if tc.kind == InternalKeyKindLogData || tc.kind == InternalKeyKindIngestSST || + tc.kind == InternalKeyKindRangeDelete { + continue + } + key := []byte(tc.key) + value := []byte(tc.value) + b.AddInternalKey(&InternalKey{UserKey: key, Trailer: base.MakeTrailer(0, tc.kind)}, value, nil) + } + verifyTestCases(b, testCases, true /* indexedKindsOnly */) +} + +func TestBatchPreAlloc(t *testing.T) { + var cases = []struct { + size int + exp int + }{ + {0, batchInitialSize}, + {batchInitialSize, batchInitialSize}, + {2 * batchInitialSize, 2 * batchInitialSize}, + } + for _, c := range cases { + b := newBatchWithSize(nil, c.size) + b.Set([]byte{0x1}, []byte{0x2}, nil) + if cap(b.data) != c.exp { + t.Errorf("Unexpected memory space, required: %d, got: %d", c.exp, cap(b.data)) + } + } +} + +func TestBatchIngestSST(t *testing.T) { + // Verify that Batch.IngestSST has the correct batch count and memtable + // size. + var b Batch + b.ingestSST(1) + require.Equal(t, int(b.Count()), 1) + b.ingestSST(2) + require.Equal(t, int(b.Count()), 2) + require.Equal(t, int(b.memTableSize), 0) + require.Equal(t, b.ingestedSSTBatch, true) +} + +func TestBatchLen(t *testing.T) { + var b Batch + + requireLenAndReprEq := func(size int) { + require.Equal(t, size, b.Len()) + require.Equal(t, size, len(b.Repr())) + } + + requireLenAndReprEq(batchHeaderLen) + + key := "test-key" + value := "test-value" + + err := b.Set([]byte(key), []byte(value), nil) + require.NoError(t, err) + + requireLenAndReprEq(33) + + err = b.Delete([]byte(key), nil) + require.NoError(t, err) + + requireLenAndReprEq(43) +} + +func TestBatchEmpty(t *testing.T) { + testBatchEmpty(t, 0) + testBatchEmpty(t, batchInitialSize) +} + +func testBatchEmpty(t *testing.T, size int) { + b := newBatchWithSize(nil, size) + require.True(t, b.Empty()) + + ops := []func(*Batch) error{ + func(b *Batch) error { return b.Set(nil, nil, nil) }, + func(b *Batch) error { return b.Merge(nil, nil, nil) }, + func(b 
*Batch) error { return b.Delete(nil, nil) }, + func(b *Batch) error { return b.DeleteRange(nil, nil, nil) }, + func(b *Batch) error { return b.LogData(nil, nil) }, + func(b *Batch) error { return b.RangeKeySet(nil, nil, nil, nil, nil) }, + func(b *Batch) error { return b.RangeKeyUnset(nil, nil, nil, nil) }, + func(b *Batch) error { return b.RangeKeyDelete(nil, nil, nil) }, + } + + for _, op := range ops { + require.NoError(t, op(b)) + require.False(t, b.Empty()) + b.Reset() + require.True(t, b.Empty()) + // Reset may choose to reuse b.data, so clear it to the zero value in + // order to test the lazy initialization of b.data. + b = newBatchWithSize(nil, size) + } + + _ = b.Reader() + require.True(t, b.Empty()) + b.Reset() + require.True(t, b.Empty()) + b = newBatchWithSize(nil, size) + + require.Equal(t, uint64(0), b.SeqNum()) + require.True(t, b.Empty()) + b.Reset() + require.True(t, b.Empty()) + b = &Batch{} + + d, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + defer d.Close() + ib := newIndexedBatch(d, DefaultComparer) + iter, _ := ib.NewIter(nil) + require.False(t, iter.First()) + iter2, err := iter.Clone(CloneOptions{}) + require.NoError(t, err) + require.NoError(t, iter.Close()) + _, err = iter.Clone(CloneOptions{}) + require.True(t, err != nil) + require.False(t, iter2.First()) + require.NoError(t, iter2.Close()) + iter3, err := ib.NewBatchOnlyIter(context.Background(), nil) + require.NoError(t, err) + require.False(t, iter3.First()) + _, err = iter3.Clone(CloneOptions{}) + require.Error(t, err) + require.NoError(t, iter3.Close()) +} + +func TestBatchApplyNoSyncWait(t *testing.T) { + db, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + defer db.Close() + var batches []*Batch + options := &WriteOptions{Sync: true} + for i := 0; i < 10000; i++ { + b := db.NewBatch() + str := fmt.Sprintf("a%d", i) + require.NoError(t, b.Set([]byte(str), []byte(str), nil)) + require.NoError(t, db.ApplyNoSyncWait(b, 
options)) + // k-v pair is visible even if not yet synced. + val, closer, err := db.Get([]byte(str)) + require.NoError(t, err) + require.Equal(t, str, string(val)) + closer.Close() + batches = append(batches, b) + } + for _, b := range batches { + require.NoError(t, b.SyncWait()) + b.Close() + } +} + +func TestBatchReset(t *testing.T) { + db, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + defer db.Close() + key := "test-key" + value := "test-value" + b := db.NewBatch() + require.NoError(t, b.Set([]byte(key), []byte(value), nil)) + dd := b.DeleteRangeDeferred(len(key), len(value)) + copy(dd.Key, key) + copy(dd.Value, value) + dd.Finish() + + require.NoError(t, b.RangeKeySet([]byte(key), []byte(value), []byte(value), []byte(value), nil)) + + b.setSeqNum(100) + b.applied.Store(true) + b.commitErr = errors.New("test-error") + b.commit.Add(1) + b.fsyncWait.Add(1) + require.Equal(t, uint32(3), b.Count()) + require.Equal(t, uint64(1), b.countRangeDels) + require.Equal(t, uint64(1), b.countRangeKeys) + require.True(t, len(b.data) > 0) + require.True(t, b.SeqNum() > 0) + require.True(t, b.memTableSize > 0) + require.NotEqual(t, b.deferredOp, DeferredBatchOp{}) + // At this point b.data has not been modified since the db.NewBatch() and is + // either nil or contains a byte slice of length batchHeaderLen, with a 0 + // seqnum encoded in data[0:8] and an arbitrary count encoded in data[8:12]. + // The following commented code will often fail. + // count := binary.LittleEndian.Uint32(b.countData()) + // if count != 0 && count != 3 { + // t.Fatalf("count: %d", count) + // } + // If we simply called b.Reset now and later used b.data to initialize + // expected, the count in expected will also be arbitrary. So we fix the + // count in b.data now by calling b.Repr(). This call isn't essential, since + // we will call b.Repr() again, and just shows that it fixes the count in + // b.data. 
+ _ = b.Repr() + require.Equal(t, uint32(3), binary.LittleEndian.Uint32(b.countData())) + + b.Reset() + require.Equal(t, db, b.db) + require.Equal(t, false, b.applied.Load()) + require.Nil(t, b.commitErr) + require.Equal(t, uint32(0), b.Count()) + require.Equal(t, uint64(0), b.countRangeDels) + require.Equal(t, uint64(0), b.countRangeKeys) + require.Equal(t, batchHeaderLen, len(b.data)) + require.Equal(t, uint64(0), b.SeqNum()) + require.Equal(t, uint64(0), b.memTableSize) + require.Equal(t, FormatMajorVersion(0x00), b.minimumFormatMajorVersion) + require.Equal(t, b.deferredOp, DeferredBatchOp{}) + _ = b.Repr() + + var expected Batch + require.NoError(t, expected.SetRepr(b.data)) + expected.db = db + require.Equal(t, &expected, b) + + // Reset batch can be used to write and commit a new record. + b.Set([]byte(key), []byte(value), nil) + require.NoError(t, db.Apply(b, nil)) + v, closer, err := db.Get([]byte(key)) + require.NoError(t, err) + defer closer.Close() + require.Equal(t, v, []byte(value)) +} + +func TestIndexedBatchReset(t *testing.T) { + indexCount := func(sl *batchskl.Skiplist) int { + count := 0 + iter := sl.NewIter(nil, nil) + defer iter.Close() + for iter.First(); iter.Valid(); iter.Next() { + count++ + } + return count + } + db, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + defer db.Close() + b := newIndexedBatch(db, DefaultComparer) + start := "start-key" + end := "end-key" + key := "test-key" + value := "test-value" + b.DeleteRange([]byte(start), []byte(end), nil) + b.Set([]byte(key), []byte(value), nil) + require.NoError(t, b. 
+ RangeKeySet([]byte(start), []byte(end), []byte("suffix"), []byte(value), nil)) + require.NotNil(t, b.rangeKeyIndex) + require.NotNil(t, b.rangeDelIndex) + require.NotNil(t, b.index) + require.Equal(t, 1, indexCount(b.index)) + + b.Reset() + require.NotNil(t, b.cmp) + require.NotNil(t, b.formatKey) + require.NotNil(t, b.abbreviatedKey) + require.NotNil(t, b.index) + require.Nil(t, b.rangeDelIndex) + require.Nil(t, b.rangeKeyIndex) + + count := func(ib *Batch) int { + iter, _ := ib.NewIter(nil) + defer iter.Close() + iter2, err := iter.Clone(CloneOptions{}) + require.NoError(t, err) + defer iter2.Close() + iter3, err := ib.NewBatchOnlyIter(context.Background(), nil) + require.NoError(t, err) + defer iter3.Close() + var count [3]int + for i, it := range []*Iterator{iter, iter2, iter3} { + for it.First(); it.Valid(); it.Next() { + count[i]++ + } + } + require.Equal(t, count[0], count[1]) + require.Equal(t, count[0], count[2]) + return count[0] + } + contains := func(ib *Batch, key, value string) bool { + iter, _ := ib.NewIter(nil) + defer iter.Close() + iter2, err := iter.Clone(CloneOptions{}) + require.NoError(t, err) + defer iter2.Close() + iter3, err := ib.NewBatchOnlyIter(context.Background(), nil) + require.NoError(t, err) + defer iter3.Close() + var found [3]bool + for i, it := range []*Iterator{iter, iter2, iter3} { + for it.First(); it.Valid(); it.Next() { + if string(it.Key()) == key && + string(it.Value()) == value { + found[i] = true + } + } + } + require.Equal(t, found[0], found[1]) + require.Equal(t, found[0], found[2]) + return found[0] + } + // Set a key and check whether the key-value pair is visible. + b.Set([]byte(key), []byte(value), nil) + require.Equal(t, 1, indexCount(b.index)) + require.Equal(t, 1, count(b)) + require.True(t, contains(b, key, value)) + + // Use range delete to delete the above inserted key-value pair. 
+ b.DeleteRange([]byte(key), []byte(value), nil) + require.NotNil(t, b.rangeDelIndex) + require.Equal(t, 1, indexCount(b.rangeDelIndex)) + require.Equal(t, 0, count(b)) + require.False(t, contains(b, key, value)) +} + +// TestIndexedBatchMutation tests mutating an indexed batch with an open +// iterator. +func TestIndexedBatchMutation(t *testing.T) { + opts := &Options{ + Comparer: testkeys.Comparer, + FS: vfs.NewMem(), + FormatMajorVersion: internalFormatNewest, + } + d, err := Open("", opts) + require.NoError(t, err) + defer func() { d.Close() }() + + b := newIndexedBatch(d, DefaultComparer) + iters := map[string]*Iterator{} + defer func() { + for _, iter := range iters { + require.NoError(t, iter.Close()) + } + }() + + datadriven.RunTest(t, "testdata/indexed_batch_mutation", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "batch": + writeBatch := newBatch(d) + if err := runBatchDefineCmd(td, writeBatch); err != nil { + return err.Error() + } + if err := writeBatch.Commit(nil); err != nil { + return err.Error() + } + return "" + case "new-batch-iter": + name := td.CmdArgs[0].String() + iters[name], _ = b.NewIter(&IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + }) + return "" + case "new-batch-only-iter": + name := td.CmdArgs[0].String() + iters[name], _ = b.NewBatchOnlyIter(context.Background(), &IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + }) + return "" + case "new-db-iter": + name := td.CmdArgs[0].String() + iters[name], _ = d.NewIter(&IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + }) + return "" + case "new-batch": + if b != nil { + require.NoError(t, b.Close()) + } + b = newIndexedBatch(d, opts.Comparer) + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + return "" + case "flush": + require.NoError(t, d.Flush()) + return "" + case "iter": + var iter string + td.ScanArgs(t, "iter", &iter) + return runIterCmd(td, iters[iter], false /* closeIter */) + case "mutate": + mut := 
newBatch(d) + if err := runBatchDefineCmd(td, mut); err != nil { + return err.Error() + } + if err := b.Apply(mut, nil); err != nil { + return err.Error() + } + return "" + case "clone": + var from, to string + var refreshBatchView bool + td.ScanArgs(t, "from", &from) + td.ScanArgs(t, "to", &to) + td.ScanArgs(t, "refresh-batch", &refreshBatchView) + var err error + iters[to], err = iters[from].Clone(CloneOptions{RefreshBatchView: refreshBatchView}) + if err != nil { + return err.Error() + } + return "" + case "reset": + for key, iter := range iters { + if err := iter.Close(); err != nil { + return err.Error() + } + delete(iters, key) + } + if d != nil { + if err := d.Close(); err != nil { + return err.Error() + } + } + opts.FS = vfs.NewMem() + d, err = Open("", opts) + require.NoError(t, err) + return "" + default: + return fmt.Sprintf("unrecognized command %q", td.Cmd) + } + }) +} + +func TestIndexedBatch_GlobalVisibility(t *testing.T) { + opts := &Options{ + FS: vfs.NewMem(), + FormatMajorVersion: internalFormatNewest, + Comparer: testkeys.Comparer, + } + d, err := Open("", opts) + require.NoError(t, err) + defer d.Close() + + require.NoError(t, d.Set([]byte("foo"), []byte("foo"), nil)) + + // Create an iterator over an empty indexed batch. + b := newIndexedBatch(d, DefaultComparer) + iterOpts := IterOptions{KeyTypes: IterKeyTypePointsAndRanges} + iter, _ := b.NewIter(&iterOpts) + defer iter.Close() + + // Mutate the database's committed state. 
+ mut := newBatch(d) + require.NoError(t, mut.Set([]byte("bar"), []byte("bar"), nil)) + require.NoError(t, mut.DeleteRange([]byte("e"), []byte("g"), nil)) + require.NoError(t, mut.RangeKeySet([]byte("a"), []byte("c"), []byte("@1"), []byte("v"), nil)) + require.NoError(t, mut.Commit(nil)) + + scanIter := func() string { + var buf bytes.Buffer + for valid := iter.First(); valid; valid = iter.Next() { + fmt.Fprintf(&buf, "%s: (", iter.Key()) + hasPoint, hasRange := iter.HasPointAndRange() + if hasPoint { + fmt.Fprintf(&buf, "%s,", iter.Value()) + } else { + fmt.Fprintf(&buf, ".,") + } + if hasRange { + start, end := iter.RangeBounds() + fmt.Fprintf(&buf, "[%s-%s)", start, end) + writeRangeKeys(&buf, iter) + } else { + fmt.Fprintf(&buf, ".") + } + fmt.Fprintln(&buf, ")") + } + return strings.TrimSpace(buf.String()) + } + // Scanning the iterator should only see the point key written before the + // iterator was constructed. + require.Equal(t, `foo: (foo,.)`, scanIter()) + + // After calling SetOptions, the iterator should still only see the point + // key written before the iterator was constructed. SetOptions refreshes the + // iterator's view of its own indexed batch, but not committed state. 
+ iter.SetOptions(&iterOpts) + require.Equal(t, `foo: (foo,.)`, scanIter()) +} + +func TestFlushableBatchReset(t *testing.T) { + var b Batch + var err error + b.flushable, err = newFlushableBatch(&b, DefaultComparer) + require.NoError(t, err) + + b.Reset() + require.Nil(t, b.flushable) +} + +func TestBatchIncrement(t *testing.T) { + testCases := []uint32{ + 0x00000000, + 0x00000001, + 0x00000002, + 0x0000007f, + 0x00000080, + 0x000000fe, + 0x000000ff, + 0x00000100, + 0x00000101, + 0x000001ff, + 0x00000200, + 0x00000fff, + 0x00001234, + 0x0000fffe, + 0x0000ffff, + 0x00010000, + 0x00010001, + 0x000100fe, + 0x000100ff, + 0x00020100, + 0x03fffffe, + 0x03ffffff, + 0x04000000, + 0x04000001, + 0x7fffffff, + 0xfffffffe, + } + for _, tc := range testCases { + var buf [batchHeaderLen]byte + binary.LittleEndian.PutUint32(buf[8:12], tc) + var b Batch + b.SetRepr(buf[:]) + b.count++ + got := binary.LittleEndian.Uint32(b.Repr()[8:12]) + want := tc + 1 + if got != want { + t.Errorf("input=%d: got %d, want %d", tc, got, want) + } + _, count := ReadBatch(b.Repr()) + if got != want { + t.Errorf("input=%d: got %d, want %d", tc, count, want) + } + } + + err := func() (err error) { + defer func() { + if v := recover(); v != nil { + if verr, ok := v.(error); ok { + err = verr + } + } + }() + var buf [batchHeaderLen]byte + binary.LittleEndian.PutUint32(buf[8:12], 0xffffffff) + var b Batch + b.SetRepr(buf[:]) + b.count++ + b.Repr() + return nil + }() + if err != ErrInvalidBatch { + t.Fatalf("expected %v, but found %v", ErrInvalidBatch, err) + } +} + +func TestBatchOpDoesIncrement(t *testing.T) { + var b Batch + key := []byte("foo") + value := []byte("bar") + + if b.Count() != 0 { + t.Fatalf("new batch has a nonzero count: %d", b.Count()) + } + + // Should increment count by 1 + _ = b.Set(key, value, nil) + if b.Count() != 1 { + t.Fatalf("expected count: %d, got %d", 1, b.Count()) + } + + var b2 Batch + // Should increment count by 1 each + _ = b2.Set(key, value, nil) + _ = b2.Delete(key, 
nil) + if b2.Count() != 2 { + t.Fatalf("expected count: %d, got %d", 2, b2.Count()) + } + + // Should increment count by b2.count() + _ = b.Apply(&b2, nil) + if b.Count() != 3 { + t.Fatalf("expected count: %d, got %d", 3, b.Count()) + } + + // Should increment count by 1 + _ = b.Merge(key, value, nil) + if b.Count() != 4 { + t.Fatalf("expected count: %d, got %d", 4, b.Count()) + } + + // Should NOT increment count. + _ = b.LogData([]byte("foobarbaz"), nil) + if b.Count() != 4 { + t.Fatalf("expected count: %d, got %d", 4, b.Count()) + } +} + +func TestBatchGet(t *testing.T) { + testCases := []struct { + method string + memTableSize uint64 + }{ + {"build", 64 << 20}, + {"build", 2 << 10}, + {"apply", 64 << 20}, + } + + for _, c := range testCases { + t.Run(fmt.Sprintf("%s,mem=%d", c.method, c.memTableSize), func(t *testing.T) { + d, err := Open("", &Options{ + FS: vfs.NewMem(), + MemTableSize: c.memTableSize, + }) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer d.Close() + var b *Batch + + datadriven.RunTest(t, "testdata/batch_get", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + switch c.method { + case "build": + b = d.NewIndexedBatch() + case "apply": + b = d.NewBatch() + } + + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + + switch c.method { + case "apply": + tmp := d.NewIndexedBatch() + tmp.Apply(b, nil) + b = tmp + } + return "" + + case "commit": + if err := b.Commit(nil); err != nil { + return err.Error() + } + b = nil + return "" + + case "get": + if len(td.CmdArgs) != 1 { + return fmt.Sprintf("%s expects 1 argument", td.Cmd) + } + v, closer, err := b.Get([]byte(td.CmdArgs[0].String())) + if err != nil { + return err.Error() + } + defer closer.Close() + return string(v) + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) + }) + } +} + +func TestBatchIter(t *testing.T) { + var b *Batch + + for _, method := range []string{"build", "apply"} { + for _, 
testdata := range []string{ + "testdata/internal_iter_next", "testdata/internal_iter_bounds"} { + t.Run(method, func(t *testing.T) { + datadriven.RunTest(t, testdata, func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + switch method { + case "build": + b = newIndexedBatch(nil, DefaultComparer) + case "apply": + b = newBatch(nil) + } + + for _, key := range strings.Split(d.Input, "\n") { + j := strings.Index(key, ":") + ikey := base.ParseInternalKey(key[:j]) + value := []byte(key[j+1:]) + b.Set(ikey.UserKey, value, nil) + } + + switch method { + case "apply": + tmp := newIndexedBatch(nil, DefaultComparer) + tmp.Apply(b, nil) + b = tmp + } + return "" + + case "iter": + var options IterOptions + for _, arg := range d.CmdArgs { + switch arg.Key { + case "lower": + if len(arg.Vals) != 1 { + return fmt.Sprintf( + "%s expects at most 1 value for lower", d.Cmd) + } + options.LowerBound = []byte(arg.Vals[0]) + case "upper": + if len(arg.Vals) != 1 { + return fmt.Sprintf( + "%s expects at most 1 value for upper", d.Cmd) + } + options.UpperBound = []byte(arg.Vals[0]) + default: + return fmt.Sprintf("unknown arg: %s", arg.Key) + } + } + iter := b.newInternalIter(&options) + defer iter.Close() + return itertest.RunInternalIterCmd(t, d, iter) + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) + }) + } + } +} + +func TestBatchRangeOps(t *testing.T) { + var b *Batch + + datadriven.RunTest(t, "testdata/batch_range_ops", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "clear": + b = nil + return "" + + case "apply": + if b == nil { + b = newIndexedBatch(nil, DefaultComparer) + } + t := newBatch(nil) + if err := runBatchDefineCmd(td, t); err != nil { + return err.Error() + } + if err := b.Apply(t, nil); err != nil { + return err.Error() + } + return "" + + case "define": + if b == nil { + b = newIndexedBatch(nil, DefaultComparer) + } + if err := runBatchDefineCmd(td, b); err != nil { + return 
err.Error() + } + return "" + + case "scan": + if len(td.CmdArgs) > 1 { + return fmt.Sprintf("%s expects at most 1 argument", td.Cmd) + } + var fragmentIter keyspan.FragmentIterator + var internalIter base.InternalIterator + switch { + case td.HasArg("range-del"): + fragmentIter = b.newRangeDelIter(nil, math.MaxUint64) + defer fragmentIter.Close() + case td.HasArg("range-key"): + fragmentIter = b.newRangeKeyIter(nil, math.MaxUint64) + defer fragmentIter.Close() + default: + internalIter = b.newInternalIter(nil) + defer internalIter.Close() + } + + var buf bytes.Buffer + if fragmentIter != nil { + for s := fragmentIter.First(); s != nil; s = fragmentIter.Next() { + for i := range s.Keys { + s.Keys[i].Trailer = base.MakeTrailer( + s.Keys[i].SeqNum()&^base.InternalKeySeqNumBatch, + s.Keys[i].Kind(), + ) + } + fmt.Fprintln(&buf, s) + } + } else { + for k, v := internalIter.First(); k != nil; k, v = internalIter.Next() { + k.SetSeqNum(k.SeqNum() &^ InternalKeySeqNumBatch) + fmt.Fprintf(&buf, "%s:%s\n", k, v.InPlaceValue()) + } + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestBatchTooLarge(t *testing.T) { + var b Batch + var result interface{} + func() { + defer func() { + if r := recover(); r != nil { + result = r + } + }() + b.grow(maxBatchSize) + }() + require.EqualValues(t, ErrBatchTooLarge, result) +} + +func TestFlushableBatchIter(t *testing.T) { + var b *flushableBatch + datadriven.RunTest(t, "testdata/internal_iter_next", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + batch := newBatch(nil) + for _, key := range strings.Split(d.Input, "\n") { + j := strings.Index(key, ":") + ikey := base.ParseInternalKey(key[:j]) + value := []byte(fmt.Sprint(ikey.SeqNum())) + batch.Set(ikey.UserKey, value, nil) + } + var err error + b, err = newFlushableBatch(batch, DefaultComparer) + require.NoError(t, err) + return "" + + case "iter": + iter := b.newIter(nil) + defer 
iter.Close() + return itertest.RunInternalIterCmd(t, d, iter) + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestFlushableBatch(t *testing.T) { + var b *flushableBatch + datadriven.RunTest(t, "testdata/flushable_batch", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + batch := newBatch(nil) + for _, key := range strings.Split(d.Input, "\n") { + j := strings.Index(key, ":") + ikey := base.ParseInternalKey(key[:j]) + value := []byte(fmt.Sprint(ikey.SeqNum())) + switch ikey.Kind() { + case InternalKeyKindDelete: + require.NoError(t, batch.Delete(ikey.UserKey, nil)) + case InternalKeyKindSet: + require.NoError(t, batch.Set(ikey.UserKey, value, nil)) + case InternalKeyKindMerge: + require.NoError(t, batch.Merge(ikey.UserKey, value, nil)) + case InternalKeyKindRangeDelete: + require.NoError(t, batch.DeleteRange(ikey.UserKey, value, nil)) + case InternalKeyKindRangeKeyDelete: + require.NoError(t, batch.RangeKeyDelete(ikey.UserKey, value, nil)) + case InternalKeyKindRangeKeySet: + require.NoError(t, batch.RangeKeySet(ikey.UserKey, value, value, value, nil)) + case InternalKeyKindRangeKeyUnset: + require.NoError(t, batch.RangeKeyUnset(ikey.UserKey, value, value, nil)) + } + } + var err error + b, err = newFlushableBatch(batch, DefaultComparer) + require.NoError(t, err) + return "" + + case "iter": + var opts IterOptions + for _, arg := range d.CmdArgs { + if len(arg.Vals) != 1 { + return fmt.Sprintf("%s: %s=", d.Cmd, arg.Key) + } + switch arg.Key { + case "lower": + opts.LowerBound = []byte(arg.Vals[0]) + case "upper": + opts.UpperBound = []byte(arg.Vals[0]) + default: + return fmt.Sprintf("%s: unknown arg: %s", d.Cmd, arg.Key) + } + } + + iter := b.newIter(&opts) + defer iter.Close() + return itertest.RunInternalIterCmd(t, d, iter) + + case "dump": + if len(d.CmdArgs) != 1 || len(d.CmdArgs[0].Vals) != 1 || d.CmdArgs[0].Key != "seq" { + return "dump seq=\n" + } + seqNum, err := 
strconv.Atoi(d.CmdArgs[0].Vals[0]) + if err != nil { + return err.Error() + } + b.setSeqNum(uint64(seqNum)) + + var buf bytes.Buffer + + iter := newInternalIterAdapter(b.newIter(nil)) + for valid := iter.First(); valid; valid = iter.Next() { + fmt.Fprintf(&buf, "%s:%s\n", iter.Key(), iter.Value()) + } + iter.Close() + + if rangeDelIter := b.newRangeDelIter(nil); rangeDelIter != nil { + scanKeyspanIterator(&buf, rangeDelIter) + rangeDelIter.Close() + } + if rangeKeyIter := b.newRangeKeyIter(nil); rangeKeyIter != nil { + scanKeyspanIterator(&buf, rangeKeyIter) + rangeKeyIter.Close() + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestFlushableBatchDeleteRange(t *testing.T) { + var fb *flushableBatch + var input string + + datadriven.RunTest(t, "testdata/delete_range", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "clear": + input = "" + return "" + + case "define": + b := newBatch(nil) + // NB: We can't actually add to the flushable batch as we can to a + // memtable (which shares the "testdata/delete_range" data), so we fake + // it by concatenating the input and rebuilding the flushable batch from + // scratch. 
+ input += "\n" + td.Input + td.Input = input + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + var err error + fb, err = newFlushableBatch(b, DefaultComparer) + require.NoError(t, err) + return "" + + case "scan": + var buf bytes.Buffer + if td.HasArg("range-del") { + fi := fb.newRangeDelIter(nil) + defer fi.Close() + scanKeyspanIterator(&buf, fi) + } else { + ii := fb.newIter(nil) + defer ii.Close() + scanInternalIter(&buf, ii) + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func scanInternalIter(w io.Writer, ii internalIterator) { + for k, v := ii.First(); k != nil; k, v = ii.Next() { + fmt.Fprintf(w, "%s:%s\n", k, v.InPlaceValue()) + } +} + +func scanKeyspanIterator(w io.Writer, ki keyspan.FragmentIterator) { + for s := ki.First(); s != nil; s = ki.Next() { + fmt.Fprintln(w, s) + } +} + +func TestFlushableBatchBytesIterated(t *testing.T) { + batch := newBatch(nil) + for j := 0; j < 1000; j++ { + key := make([]byte, 8+j%3) + value := make([]byte, 7+j%5) + batch.Set(key, value, nil) + + fb, err := newFlushableBatch(batch, DefaultComparer) + require.NoError(t, err) + + var bytesIterated uint64 + it := fb.newFlushIter(nil, &bytesIterated) + + var prevIterated uint64 + for key, _ := it.First(); key != nil; key, _ = it.Next() { + if bytesIterated < prevIterated { + t.Fatalf("bytesIterated moved backward: %d < %d", bytesIterated, prevIterated) + } + prevIterated = bytesIterated + } + + expected := fb.inuseBytes() + if bytesIterated != expected { + t.Fatalf("bytesIterated: got %d, want %d", bytesIterated, expected) + } + } +} + +func TestEmptyFlushableBatch(t *testing.T) { + // Verify that we can create a flushable batch on an empty batch. 
+ fb, err := newFlushableBatch(newBatch(nil), DefaultComparer) + require.NoError(t, err) + it := newInternalIterAdapter(fb.newIter(nil)) + require.False(t, it.First()) +} + +func TestBatchCommitStats(t *testing.T) { + testFunc := func() error { + db, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + defer db.Close() + b := db.NewBatch() + defer b.Close() + stats := b.CommitStats() + require.Equal(t, BatchCommitStats{}, stats) + + // The stall code peers into the internals, instead of adding general + // purpose hooks, to avoid changing production code. We can revisit this + // choice if it becomes hard to maintain. + + // Commit semaphore stall funcs. + var unstallCommitSemaphore func() + stallCommitSemaphore := func() { + commitPipeline := db.commit + commitSemaphoreReserved := 0 + done := false + for !done { + select { + case commitPipeline.commitQueueSem <- struct{}{}: + commitSemaphoreReserved++ + default: + done = true + } + if done { + break + } + } + unstallCommitSemaphore = func() { + for i := 0; i < commitSemaphoreReserved; i++ { + <-commitPipeline.commitQueueSem + } + } + } + + // Memstable stall funcs. + var unstallMemtable func() + stallMemtable := func() { + db.mu.Lock() + defer db.mu.Unlock() + prev := db.opts.MemTableStopWritesThreshold + db.opts.MemTableStopWritesThreshold = 0 + unstallMemtable = func() { + db.mu.Lock() + defer db.mu.Unlock() + db.opts.MemTableStopWritesThreshold = prev + db.mu.compact.cond.Broadcast() + } + } + + // L0 read-amp stall funcs. + var unstallL0ReadAmp func() + stallL0ReadAmp := func() { + db.mu.Lock() + defer db.mu.Unlock() + prev := db.opts.L0StopWritesThreshold + db.opts.L0StopWritesThreshold = 0 + unstallL0ReadAmp = func() { + db.mu.Lock() + defer db.mu.Unlock() + db.opts.L0StopWritesThreshold = prev + db.mu.compact.cond.Broadcast() + } + } + + // Commit wait stall funcs. 
+ var unstallCommitWait func() + stallCommitWait := func() { + b.commit.Add(1) + unstallCommitWait = func() { + b.commit.Done() + } + } + + // Stall everything. + stallCommitSemaphore() + stallMemtable() + stallL0ReadAmp() + stallCommitWait() + + // Exceed initialMemTableSize -- this is needed to make stallMemtable work. + require.NoError(t, b.Set(make([]byte, initialMemTableSize), nil, nil)) + + var commitWG sync.WaitGroup + commitWG.Add(1) + go func() { + require.NoError(t, db.Apply(b, &WriteOptions{Sync: true})) + commitWG.Done() + }() + // Unstall things in the order that the stalls will happen. + sleepDuration := 10 * time.Millisecond + time.Sleep(sleepDuration) + unstallCommitSemaphore() + time.Sleep(sleepDuration) + unstallMemtable() + time.Sleep(sleepDuration) + unstallL0ReadAmp() + time.Sleep(sleepDuration) + unstallCommitWait() + + // Wait for Apply to return. + commitWG.Wait() + stats = b.CommitStats() + expectedDuration := (2 * sleepDuration) / 3 + if expectedDuration > stats.SemaphoreWaitDuration { + return errors.Errorf("SemaphoreWaitDuration %s is too low", + stats.SemaphoreWaitDuration.String()) + } + if expectedDuration > stats.MemTableWriteStallDuration { + return errors.Errorf("MemTableWriteStallDuration %s is too low", + stats.MemTableWriteStallDuration.String()) + } + if expectedDuration > stats.L0ReadAmpWriteStallDuration { + return errors.Errorf("L0ReadAmpWriteStallDuration %s is too low", + stats.L0ReadAmpWriteStallDuration) + } + if expectedDuration > stats.CommitWaitDuration { + return errors.Errorf("CommitWaitDuration %s is too low", + stats.CommitWaitDuration) + } + if 5*expectedDuration > stats.TotalDuration { + return errors.Errorf("TotalDuration %s is too low", + stats.TotalDuration) + } + return nil + } + // Try a few times, and succeed if one of them succeeds. 
+ var err error + for i := 0; i < 5; i++ { + err = testFunc() + if err == nil { + break + } + } + require.NoError(t, err) +} + +func TestBatchReader(t *testing.T) { + datadriven.RunTest(t, "testdata/batch_reader", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "scan": + var repr bytes.Buffer + for i, l := range strings.Split(td.Input, "\n") { + // Remove any trailing comments behind #. + if i := strings.IndexRune(l, '#'); i >= 0 { + l = l[:i] + } + // Strip all whitespace from the line. + l = strings.Map(func(r rune) rune { + if unicode.IsSpace(r) { + return -1 + } + return r + }, l) + b, err := hex.DecodeString(l) + if err != nil { + return fmt.Sprintf("failed to decode hex; line %d", i) + } + repr.Write(b) + } + r, count := ReadBatch(repr.Bytes()) + var out strings.Builder + fmt.Fprintf(&out, "Count: %d\n", count) + for { + kind, ukey, value, ok, err := r.Next() + if !ok { + if err != nil { + fmt.Fprintf(&out, "err: %s\n", err) + } else { + fmt.Fprint(&out, "eof") + } + break + } + fmt.Fprintf(&out, "%s: %q: %q\n", kind, ukey, value) + } + return out.String() + + default: + return fmt.Sprintf("unrecognized command %q", td.Cmd) + } + }) +} + +func BenchmarkBatchSet(b *testing.B) { + value := make([]byte, 10) + for i := range value { + value[i] = byte(i) + } + key := make([]byte, 8) + batch := newBatch(nil) + + b.ResetTimer() + + const batchSize = 1000 + for i := 0; i < b.N; i += batchSize { + end := i + batchSize + if end > b.N { + end = b.N + } + + for j := i; j < end; j++ { + binary.BigEndian.PutUint64(key, uint64(j)) + batch.Set(key, value, nil) + } + batch.Reset() + } + + b.StopTimer() +} + +func BenchmarkIndexedBatchSet(b *testing.B) { + value := make([]byte, 10) + for i := range value { + value[i] = byte(i) + } + key := make([]byte, 8) + batch := newIndexedBatch(nil, DefaultComparer) + + b.ResetTimer() + + const batchSize = 1000 + for i := 0; i < b.N; i += batchSize { + end := i + batchSize + if end > b.N { + end = b.N + } + + 
for j := i; j < end; j++ { + binary.BigEndian.PutUint64(key, uint64(j)) + batch.Set(key, value, nil) + } + batch.Reset() + } + + b.StopTimer() +} + +func BenchmarkBatchSetDeferred(b *testing.B) { + value := make([]byte, 10) + for i := range value { + value[i] = byte(i) + } + key := make([]byte, 8) + batch := newBatch(nil) + + b.ResetTimer() + + const batchSize = 1000 + for i := 0; i < b.N; i += batchSize { + end := i + batchSize + if end > b.N { + end = b.N + } + + for j := i; j < end; j++ { + binary.BigEndian.PutUint64(key, uint64(j)) + deferredOp := batch.SetDeferred(len(key), len(value)) + + copy(deferredOp.Key, key) + copy(deferredOp.Value, value) + + deferredOp.Finish() + } + batch.Reset() + } + + b.StopTimer() +} + +func BenchmarkIndexedBatchSetDeferred(b *testing.B) { + value := make([]byte, 10) + for i := range value { + value[i] = byte(i) + } + key := make([]byte, 8) + batch := newIndexedBatch(nil, DefaultComparer) + + b.ResetTimer() + + const batchSize = 1000 + for i := 0; i < b.N; i += batchSize { + end := i + batchSize + if end > b.N { + end = b.N + } + + for j := i; j < end; j++ { + binary.BigEndian.PutUint64(key, uint64(j)) + deferredOp := batch.SetDeferred(len(key), len(value)) + + copy(deferredOp.Key, key) + copy(deferredOp.Value, value) + + deferredOp.Finish() + } + batch.Reset() + } + + b.StopTimer() +} + +func TestBatchMemTableSizeOverflow(t *testing.T) { + opts := &Options{ + FS: vfs.NewMem(), + } + opts.EnsureDefaults() + d, err := Open("", opts) + require.NoError(t, err) + + bigValue := make([]byte, 1000) + b := d.NewBatch() + + // memTableSize can overflow as a uint32. 
+ b.memTableSize = math.MaxUint32 - 50 + for i := 0; i < 10; i++ { + k := fmt.Sprintf("key-%05d", i) + require.NoError(t, b.Set([]byte(k), bigValue, nil)) + } + require.Greater(t, b.memTableSize, uint64(math.MaxUint32)) + require.NoError(t, b.Close()) + require.NoError(t, d.Close()) +} + +// TestBatchSpanCaching stress tests the caching of keyspan.Spans for range +// tombstones and range keys. +func TestBatchSpanCaching(t *testing.T) { + opts := &Options{ + Comparer: testkeys.Comparer, + FS: vfs.NewMem(), + FormatMajorVersion: internalFormatNewest, + } + d, err := Open("", opts) + require.NoError(t, err) + defer d.Close() + + ks := testkeys.Alpha(1) + b := d.NewIndexedBatch() + for i := int64(0); i < ks.Count(); i++ { + k := testkeys.Key(ks, i) + require.NoError(t, b.Set(k, k, nil)) + } + + seed := int64(time.Now().UnixNano()) + t.Logf("seed = %d", seed) + rng := rand.New(rand.NewSource(seed)) + iters := make([][]*Iterator, ks.Count()) + defer func() { + for _, keyIters := range iters { + for _, iter := range keyIters { + _ = iter.Close() + } + } + }() + + // This test begins with one point key for every letter of the alphabet. + // Over the course of the test, point keys are 'replaced' with range keys + // with narrow bounds from left to right. Iterators are created at random, + // sometimes from the batch and sometimes by cloning existing iterators. + + checkIter := func(iter *Iterator, nextKey int64) { + var i int64 + for valid := iter.First(); valid; valid = iter.Next() { + hasPoint, hasRange := iter.HasPointAndRange() + require.Equal(t, testkeys.Key(ks, i), iter.Key()) + if i < nextKey { + // This key should not exist as a point key, just a range key. + require.False(t, hasPoint) + require.True(t, hasRange) + } else { + require.True(t, hasPoint) + require.False(t, hasRange) + } + i++ + } + require.Equal(t, ks.Count(), i) + } + + // Each iteration of the below loop either reads or writes. 
+ // + // A write iteration writes a new RANGEDEL and RANGEKEYSET into the batch, + // covering a single point key seeded above. Writing these two span keys + // together 'replaces' the point key with a range key. Each write iteration + // ratchets nextWriteKey so the next write iteration will write the next + // key. + // + // A read iteration creates a new iterator and ensures its state is + // expected: some prefix of only point keys, followed by a suffix of only + // range keys. Iterators created through Clone should observe the point keys + // that existed when the cloned iterator was created. + for nextWriteKey := int64(0); nextWriteKey < ks.Count(); { + p := rng.Float64() + switch { + case p < .10: /* 10 % */ + // Write a new range deletion and range key. + start := testkeys.Key(ks, nextWriteKey) + end := append(start, 0x00) + require.NoError(t, b.DeleteRange(start, end, nil)) + require.NoError(t, b.RangeKeySet(start, end, nil, []byte("foo"), nil)) + nextWriteKey++ + case p < .55: /* 45 % */ + // Create a new iterator directly from the batch and check that it + // observes the correct state. + iter, _ := b.NewIter(&IterOptions{KeyTypes: IterKeyTypePointsAndRanges}) + checkIter(iter, nextWriteKey) + iters[nextWriteKey] = append(iters[nextWriteKey], iter) + default: /* 45 % */ + // Create a new iterator through cloning a random existing iterator + // and check that it observes the right state. + readKey := rng.Int63n(nextWriteKey + 1) + itersForReadKey := iters[readKey] + if len(itersForReadKey) == 0 { + continue + } + iter, err := itersForReadKey[rng.Intn(len(itersForReadKey))].Clone(CloneOptions{}) + require.NoError(t, err) + checkIter(iter, readKey) + iters[readKey] = append(iters[readKey], iter) + } + } +} diff --git a/pebble/bloom/bloom.go b/pebble/bloom/bloom.go new file mode 100644 index 0000000..bf72e1d --- /dev/null +++ b/pebble/bloom/bloom.go @@ -0,0 +1,250 @@ +// Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package bloom implements Bloom filters. +package bloom // import "github.com/cockroachdb/pebble/bloom" + +import ( + "encoding/binary" + "fmt" + "sync" + + "github.com/cockroachdb/pebble/internal/base" +) + +const ( + cacheLineSize = 64 + cacheLineBits = cacheLineSize * 8 +) + +type tableFilter []byte + +func (f tableFilter) MayContain(key []byte) bool { + if len(f) <= 5 { + return false + } + n := len(f) - 5 + nProbes := f[n] + nLines := binary.LittleEndian.Uint32(f[n+1:]) + cacheLineBits := 8 * (uint32(n) / nLines) + + h := hash(key) + delta := h>>17 | h<<15 + b := (h % nLines) * cacheLineBits + + for j := uint8(0); j < nProbes; j++ { + bitPos := b + (h % cacheLineBits) + if f[bitPos/8]&(1<<(bitPos%8)) == 0 { + return false + } + h += delta + } + return true +} + +func calculateProbes(bitsPerKey int) uint32 { + // We intentionally round down to reduce probing cost a little bit + n := uint32(float64(bitsPerKey) * 0.69) // 0.69 =~ ln(2) + if n < 1 { + n = 1 + } + if n > 30 { + n = 30 + } + return n +} + +// extend appends n zero bytes to b. It returns the overall slice (of length +// n+len(originalB)) and the slice of n trailing zeroes. +func extend(b []byte, n int) (overall, trailer []byte) { + want := n + len(b) + if want <= cap(b) { + overall = b[:want] + trailer = overall[len(b):] + for i := range trailer { + trailer[i] = 0 + } + } else { + // Grow the capacity exponentially, with a 1KiB minimum. + c := 1024 + for c < want { + c += c / 4 + } + overall = make([]byte, want, c) + trailer = overall[len(b):] + copy(overall, b) + } + return overall, trailer +} + +// hash implements a hashing algorithm similar to the Murmur hash. 
+func hash(b []byte) uint32 { + const ( + seed = 0xbc9f1d34 + m = 0xc6a4a793 + ) + h := uint32(seed) ^ uint32(uint64(uint32(len(b))*m)) + for ; len(b) >= 4; b = b[4:] { + h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + h *= m + h ^= h >> 16 + } + + // The code below first casts each byte to a signed 8-bit integer. This is + // necessary to match RocksDB's behavior. Note that the `byte` type in Go is + // unsigned. What is the difference between casting a signed 8-bit value vs + // unsigned 8-bit value into an unsigned 32-bit value? + // Sign-extension. Consider the value 250 which has the bit pattern 11111010: + // + // uint32(250) = 00000000000000000000000011111010 + // uint32(int8(250)) = 11111111111111111111111111111010 + // + // Note that the original LevelDB code did not explicitly cast to a signed + // 8-bit value which left the behavior dependent on whether C characters were + // signed or unsigned which is a compiler flag for gcc (-funsigned-char). + switch len(b) { + case 3: + h += uint32(int8(b[2])) << 16 + fallthrough + case 2: + h += uint32(int8(b[1])) << 8 + fallthrough + case 1: + h += uint32(int8(b[0])) + h *= m + h ^= h >> 24 + } + return h +} + +const hashBlockLen = 16384 + +type hashBlock [hashBlockLen]uint32 + +var hashBlockPool = sync.Pool{ + New: func() interface{} { + return &hashBlock{} + }, +} + +type tableFilterWriter struct { + bitsPerKey int + + numHashes int + // We store the hashes in blocks. + blocks []*hashBlock + lastHash uint32 + + // Initial "in-line" storage for the blocks slice (to avoid some small + // allocations). + blocksBuf [16]*hashBlock +} + +func newTableFilterWriter(bitsPerKey int) *tableFilterWriter { + w := &tableFilterWriter{ + bitsPerKey: bitsPerKey, + } + w.blocks = w.blocksBuf[:0] + return w +} + +// AddKey implements the base.FilterWriter interface. 
+func (w *tableFilterWriter) AddKey(key []byte) { + h := hash(key) + if w.numHashes != 0 && h == w.lastHash { + return + } + ofs := w.numHashes % hashBlockLen + if ofs == 0 { + // Time for a new block. + w.blocks = append(w.blocks, hashBlockPool.Get().(*hashBlock)) + } + w.blocks[len(w.blocks)-1][ofs] = h + w.numHashes++ + w.lastHash = h +} + +// Finish implements the base.FilterWriter interface. +func (w *tableFilterWriter) Finish(buf []byte) []byte { + // The table filter format matches the RocksDB full-file filter format. + var nLines int + if w.numHashes != 0 { + nLines = (w.numHashes*w.bitsPerKey + cacheLineBits - 1) / (cacheLineBits) + // Make nLines an odd number to make sure more bits are involved when + // determining which block. + if nLines%2 == 0 { + nLines++ + } + } + + nBytes := nLines * cacheLineSize + // +5: 4 bytes for num-lines, 1 byte for num-probes + buf, filter := extend(buf, nBytes+5) + + if nLines != 0 { + nProbes := calculateProbes(w.bitsPerKey) + for bIdx, b := range w.blocks { + length := hashBlockLen + if bIdx == len(w.blocks)-1 && w.numHashes%hashBlockLen != 0 { + length = w.numHashes % hashBlockLen + } + for _, h := range b[:length] { + delta := h>>17 | h<<15 // rotate right 17 bits + b := (h % uint32(nLines)) * (cacheLineBits) + for i := uint32(0); i < nProbes; i++ { + bitPos := b + (h % cacheLineBits) + filter[bitPos/8] |= (1 << (bitPos % 8)) + h += delta + } + } + } + filter[nBytes] = byte(nProbes) + binary.LittleEndian.PutUint32(filter[nBytes+1:], uint32(nLines)) + } + + // Release the hash blocks. + for i, b := range w.blocks { + hashBlockPool.Put(b) + w.blocks[i] = nil + } + w.blocks = w.blocks[:0] + w.numHashes = 0 + return buf +} + +// FilterPolicy implements the FilterPolicy interface from the pebble package. +// +// The integer value is the approximate number of bits used per key. A good +// value is 10, which yields a filter with ~ 1% false positive rate. 
+type FilterPolicy int + +var _ base.FilterPolicy = FilterPolicy(0) + +// Name implements the pebble.FilterPolicy interface. +func (p FilterPolicy) Name() string { + // This string looks arbitrary, but its value is written to LevelDB .sst + // files, and should be this exact value to be compatible with those files + // and with the C++ LevelDB code. + return "rocksdb.BuiltinBloomFilter" +} + +// MayContain implements the pebble.FilterPolicy interface. +func (p FilterPolicy) MayContain(ftype base.FilterType, f, key []byte) bool { + switch ftype { + case base.TableFilter: + return tableFilter(f).MayContain(key) + default: + panic(fmt.Sprintf("unknown filter type: %v", ftype)) + } +} + +// NewWriter implements the pebble.FilterPolicy interface. +func (p FilterPolicy) NewWriter(ftype base.FilterType) base.FilterWriter { + switch ftype { + case base.TableFilter: + return newTableFilterWriter(int(p)) + default: + panic(fmt.Sprintf("unknown filter type: %v", ftype)) + } +} diff --git a/pebble/bloom/bloom_test.go b/pebble/bloom/bloom_test.go new file mode 100644 index 0000000..74a6f62 --- /dev/null +++ b/pebble/bloom/bloom_test.go @@ -0,0 +1,219 @@ +// Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package bloom + +import ( + "crypto/rand" + "strings" + "testing" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/stretchr/testify/require" +) + +func (f tableFilter) String() string { + var buf strings.Builder + for i, x := range f { + if i > 0 { + if i%8 == 0 { + buf.WriteString("\n") + } else { + buf.WriteString(" ") + } + } + + for j := uint(0); j < 8; j++ { + if x&(1<<(7-j)) != 0 { + buf.WriteString("1") + } else { + buf.WriteString(".") + } + } + } + buf.WriteString("\n") + return buf.String() +} + +func newTableFilter(bitsPerKey int, keys ...[]byte) tableFilter { + w := FilterPolicy(bitsPerKey).NewWriter(base.TableFilter) + for _, key := range keys { + w.AddKey(key) + } + return tableFilter(w.Finish(nil)) +} + +func TestSmallBloomFilter(t *testing.T) { + f := newTableFilter(10, []byte("hello"), []byte("world")) + + // The magic expected string comes from running RocksDB's util/bloom_test.cc:FullBloomTest.FullSmall. + want := ` +........ ........ ........ .......1 ........ ........ ........ ........ +........ .1...... ........ .1...... ........ ........ ........ ........ +...1.... ........ ........ ........ ........ ........ ........ ........ +........ ........ ........ ........ ........ ........ ........ ...1.... +........ ........ ........ ........ .....1.. ........ ........ ........ +.......1 ........ ........ ........ ........ ........ .1...... ........ +........ ........ ........ ........ ........ ...1.... ........ ........ +.......1 ........ ........ ........ .1...1.. ........ ........ ........ +.....11. .......1 ........ ........ ........ 
+` + want = strings.TrimLeft(want, "\n") + require.EqualValues(t, want, f.String()) + + m := map[string]bool{ + "hello": true, + "world": true, + "x": false, + "foo": false, + } + for k, want := range m { + require.EqualValues(t, want, f.MayContain([]byte(k))) + } +} + +func TestBloomFilter(t *testing.T) { + nextLength := func(x int) int { + if x < 10 { + return x + 1 + } + if x < 100 { + return x + 10 + } + if x < 1000 { + return x + 100 + } + return x + 1000 + } + le32 := func(i int) []byte { + b := make([]byte, 4) + b[0] = uint8(uint32(i) >> 0) + b[1] = uint8(uint32(i) >> 8) + b[2] = uint8(uint32(i) >> 16) + b[3] = uint8(uint32(i) >> 24) + return b + } + + nMediocreFilters, nGoodFilters := 0, 0 +loop: + for length := 1; length <= 10000; length = nextLength(length) { + keys := make([][]byte, 0, length) + for i := 0; i < length; i++ { + keys = append(keys, le32(i)) + } + f := newTableFilter(10, keys...) + // The size of the table bloom filter is measured in multiples of the + // cache line size. The '+2' contribution captures the rounding up in the + // length division plus preferring an odd number of cache lines. As such, + // this formula isn't exact, but the exact formula is hard to read. + maxLen := 5 + ((length*10)/cacheLineBits+2)*cacheLineSize + if len(f) > maxLen { + t.Errorf("length=%d: len(f)=%d > max len %d", length, len(f), maxLen) + continue + } + + // All added keys must match. + for _, key := range keys { + if !f.MayContain(key) { + t.Errorf("length=%d: did not contain key %q", length, key) + continue loop + } + } + + // Check false positive rate. 
+ nFalsePositive := 0 + for i := 0; i < 10000; i++ { + if f.MayContain(le32(1e9 + i)) { + nFalsePositive++ + } + } + if nFalsePositive > 0.02*10000 { + t.Errorf("length=%d: %d false positives in 10000", length, nFalsePositive) + continue + } + if nFalsePositive > 0.0125*10000 { + nMediocreFilters++ + } else { + nGoodFilters++ + } + } + + if nMediocreFilters > nGoodFilters/5 { + t.Errorf("%d mediocre filters but only %d good filters", nMediocreFilters, nGoodFilters) + } +} + +func TestHash(t *testing.T) { + testCases := []struct { + s string + expected uint32 + }{ + // The magic expected numbers come from RocksDB's util/hash_test.cc:TestHash. + {"", 3164544308}, + {"\x08", 422599524}, + {"\x17", 3168152998}, + {"\x9a", 3195034349}, + {"\x1c", 2651681383}, + {"\x4d\x76", 2447836956}, + {"\x52\xd5", 3854228105}, + {"\x91\xf7", 31066776}, + {"\xd6\x27", 1806091603}, + {"\x30\x46\x0b", 3808221797}, + {"\x56\xdc\xd6", 2157698265}, + {"\xd4\x52\x33", 1721992661}, + {"\x6a\xb5\xf4", 2469105222}, + {"\x67\x53\x81\x1c", 118283265}, + {"\x69\xb8\xc0\x88", 3416318611}, + {"\x1e\x84\xaf\x2d", 3315003572}, + {"\x46\xdc\x54\xbe", 447346355}, + {"\xd0\x7a\x6e\xea\x56", 4255445370}, + {"\x86\x83\xd5\xa4\xd8", 2390603402}, + {"\xb7\x46\xbb\x77\xce", 2048907743}, + {"\x6c\xa8\xbc\xe5\x99", 2177978500}, + {"\x5c\x5e\xe1\xa0\x73\x81", 1036846008}, + {"\x08\x5d\x73\x1c\xe5\x2e", 229980482}, + {"\x42\xfb\xf2\x52\xb4\x10", 3655585422}, + {"\x73\xe1\xff\x56\x9c\xce", 3502708029}, + {"\x5c\xbe\x97\x75\x54\x9a\x52", 815120748}, + {"\x16\x82\x39\x49\x88\x2b\x36", 3056033698}, + {"\x59\x77\xf0\xa7\x24\xf4\x78", 587205227}, + {"\xd3\xa5\x7c\x0e\xc0\x02\x07", 2030937252}, + {"\x31\x1b\x98\x75\x96\x22\xd3\x9a", 469635402}, + {"\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 3530274698}, + {"\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 1974545809}, + {"\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 3563570120}, + {"\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 2706087434}, + {"\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 1534654151}, + 
{"\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 2355554696}, + {"\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 1400800912}, + {"\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 3420325137}, + {"\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 3427803584}, + {"\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 1152407945}, + {"\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 3382479516}, + } + for _, tc := range testCases { + t.Run("", func(t *testing.T) { + require.EqualValues(t, tc.expected, hash([]byte(tc.s))) + }) + } +} + +func BenchmarkBloomFilter(b *testing.B) { + const keyLen = 128 + const numKeys = 1024 + keys := make([][]byte, numKeys) + for i := range keys { + keys[i] = make([]byte, keyLen) + _, _ = rand.Read(keys[i]) + } + b.ResetTimer() + policy := FilterPolicy(10) + for i := 0; i < b.N; i++ { + w := policy.NewWriter(base.TableFilter) + for _, key := range keys { + w.AddKey(key) + } + w.Finish(nil) + } +} diff --git a/pebble/cache.go b/pebble/cache.go new file mode 100644 index 0000000..91f5532 --- /dev/null +++ b/pebble/cache.go @@ -0,0 +1,23 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import "github.com/cockroachdb/pebble/internal/cache" + +// Cache exports the cache.Cache type. +type Cache = cache.Cache + +// NewCache creates a new cache of the specified size. Memory for the cache is +// allocated on demand, not during initialization. The cache is created with a +// reference count of 1. Each DB it is associated with adds a reference, so the +// creator of the cache should usually release their reference after the DB is +// created. +// +// c := pebble.NewCache(...) 
+// defer c.Unref() +// d, err := pebble.Open(pebble.Options{Cache: c}) +func NewCache(size int64) *cache.Cache { + return cache.New(size) +} diff --git a/pebble/checkpoint.go b/pebble/checkpoint.go new file mode 100644 index 0000000..f321c01 --- /dev/null +++ b/pebble/checkpoint.go @@ -0,0 +1,428 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "io" + "os" + + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/record" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/vfs/atomicfs" +) + +// checkpointOptions hold the optional parameters to construct checkpoint +// snapshots. +type checkpointOptions struct { + // flushWAL set to true will force a flush and sync of the WAL prior to + // checkpointing. + flushWAL bool + + // If set, any SSTs that don't overlap with these spans are excluded from a checkpoint. + restrictToSpans []CheckpointSpan +} + +// CheckpointOption set optional parameters used by `DB.Checkpoint`. +type CheckpointOption func(*checkpointOptions) + +// WithFlushedWAL enables flushing and syncing the WAL prior to constructing a +// checkpoint. This guarantees that any writes committed before calling +// DB.Checkpoint will be part of that checkpoint. +// +// Note that this setting can only be useful in cases when some writes are +// performed with Sync = false. Otherwise, the guarantee will already be met. +// +// Passing this option is functionally equivalent to calling +// DB.LogData(nil, Sync) right before DB.Checkpoint. +func WithFlushedWAL() CheckpointOption { + return func(opt *checkpointOptions) { + opt.flushWAL = true + } +} + +// WithRestrictToSpans specifies spans of interest for the checkpoint. Any SSTs +// that don't overlap with any of these spans are excluded from the checkpoint. 
+// +// Note that the checkpoint can still surface keys outside of these spans (from +// the WAL and from SSTs that partially overlap with these spans). Moreover, +// these surface keys aren't necessarily "valid" in that they could have been +// modified but the SST containing the modification is excluded. +func WithRestrictToSpans(spans []CheckpointSpan) CheckpointOption { + return func(opt *checkpointOptions) { + opt.restrictToSpans = spans + } +} + +// CheckpointSpan is a key range [Start, End) (inclusive on Start, exclusive on +// End) of interest for a checkpoint. +type CheckpointSpan struct { + Start []byte + End []byte +} + +// excludeFromCheckpoint returns true if an SST file should be excluded from the +// checkpoint because it does not overlap with the spans of interest +// (opt.restrictToSpans). +func excludeFromCheckpoint(f *fileMetadata, opt *checkpointOptions, cmp Compare) bool { + if len(opt.restrictToSpans) == 0 { + // Option not set; don't exclude anything. + return false + } + for _, s := range opt.restrictToSpans { + if f.Overlaps(cmp, s.Start, s.End, true /* exclusiveEnd */) { + return false + } + } + // None of the restrictToSpans overlapped; we can exclude this file. + return true +} + +// mkdirAllAndSyncParents creates destDir and any of its missing parents. +// Those missing parents, as well as the closest existing ancestor, are synced. +// Returns a handle to the directory created at destDir. +func mkdirAllAndSyncParents(fs vfs.FS, destDir string) (vfs.File, error) { + // Collect paths for all directories between destDir (excluded) and its + // closest existing ancestor (included). + var parentPaths []string + foundExistingAncestor := false + for parentPath := fs.PathDir(destDir); parentPath != "."; parentPath = fs.PathDir(parentPath) { + parentPaths = append(parentPaths, parentPath) + _, err := fs.Stat(parentPath) + if err == nil { + // Exit loop at the closest existing ancestor. 
+ foundExistingAncestor = true + break + } + if !oserror.IsNotExist(err) { + return nil, err + } + } + // Handle empty filesystem edge case. + if !foundExistingAncestor { + parentPaths = append(parentPaths, "") + } + // Create destDir and any of its missing parents. + if err := fs.MkdirAll(destDir, 0755); err != nil { + return nil, err + } + // Sync all the parent directories up to the closest existing ancestor, + // included. + for _, parentPath := range parentPaths { + parentDir, err := fs.OpenDir(parentPath) + if err != nil { + return nil, err + } + err = parentDir.Sync() + if err != nil { + _ = parentDir.Close() + return nil, err + } + err = parentDir.Close() + if err != nil { + return nil, err + } + } + return fs.OpenDir(destDir) +} + +// Checkpoint constructs a snapshot of the DB instance in the specified +// directory. The WAL, MANIFEST, OPTIONS, and sstables will be copied into the +// snapshot. Hard links will be used when possible. Beware of the significant +// space overhead for a checkpoint if hard links are disabled. Also beware that +// even if hard links are used, the space overhead for the checkpoint will +// increase over time as the DB performs compactions. +func (d *DB) Checkpoint( + destDir string, opts ...CheckpointOption, +) ( + ckErr error, /* used in deferred cleanup */ +) { + opt := &checkpointOptions{} + for _, fn := range opts { + fn(opt) + } + + if _, err := d.opts.FS.Stat(destDir); !oserror.IsNotExist(err) { + if err == nil { + return &os.PathError{ + Op: "checkpoint", + Path: destDir, + Err: oserror.ErrExist, + } + } + return err + } + + if opt.flushWAL && !d.opts.DisableWAL { + // Write an empty log-data record to flush and sync the WAL. + if err := d.LogData(nil /* data */, Sync); err != nil { + return err + } + } + + // Disable file deletions. 
+ d.mu.Lock() + d.disableFileDeletions() + defer func() { + d.mu.Lock() + defer d.mu.Unlock() + d.enableFileDeletions() + }() + + // TODO(peter): RocksDB provides the option to roll the manifest if the + // MANIFEST size is too large. Should we do this too? + + // Lock the manifest before getting the current version. We need the + // length of the manifest that we read to match the current version that + // we read, otherwise we might copy a versionEdit not reflected in the + // sstables we copy/link. + d.mu.versions.logLock() + // Get the unflushed log files, the current version, and the current manifest + // file number. + memQueue := d.mu.mem.queue + current := d.mu.versions.currentVersion() + formatVers := d.FormatMajorVersion() + manifestFileNum := d.mu.versions.manifestFileNum + manifestSize := d.mu.versions.manifest.Size() + optionsFileNum := d.optionsFileNum + virtualBackingFiles := make(map[base.DiskFileNum]struct{}) + for diskFileNum := range d.mu.versions.backingState.fileBackingMap { + virtualBackingFiles[diskFileNum] = struct{}{} + } + // Release the manifest and DB.mu so we don't block other operations on + // the database. + d.mu.versions.logUnlock() + d.mu.Unlock() + + // Wrap the normal filesystem with one which wraps newly created files with + // vfs.NewSyncingFile. + fs := vfs.NewSyncingFS(d.opts.FS, vfs.SyncingFileOptions{ + NoSyncOnClose: d.opts.NoSyncOnClose, + BytesPerSync: d.opts.BytesPerSync, + }) + + // Create the dir and its parents (if necessary), and sync them. + var dir vfs.File + defer func() { + if dir != nil { + _ = dir.Close() + } + if ckErr != nil { + // Attempt to cleanup on error. + _ = fs.RemoveAll(destDir) + } + }() + dir, ckErr = mkdirAllAndSyncParents(fs, destDir) + if ckErr != nil { + return ckErr + } + + { + // Link or copy the OPTIONS. 
+ srcPath := base.MakeFilepath(fs, d.dirname, fileTypeOptions, optionsFileNum) + destPath := fs.PathJoin(destDir, fs.PathBase(srcPath)) + ckErr = vfs.LinkOrCopy(fs, srcPath, destPath) + if ckErr != nil { + return ckErr + } + } + + { + // Set the format major version in the destination directory. + var versionMarker *atomicfs.Marker + versionMarker, _, ckErr = atomicfs.LocateMarker(fs, destDir, formatVersionMarkerName) + if ckErr != nil { + return ckErr + } + + // We use the marker to encode the active format version in the + // marker filename. Unlike other uses of the atomic marker, + // there is no file with the filename `formatVers.String()` on + // the filesystem. + ckErr = versionMarker.Move(formatVers.String()) + if ckErr != nil { + return ckErr + } + ckErr = versionMarker.Close() + if ckErr != nil { + return ckErr + } + } + + var excludedFiles map[deletedFileEntry]*fileMetadata + // Set of FileBacking.DiskFileNum which will be required by virtual sstables + // in the checkpoint. + requiredVirtualBackingFiles := make(map[base.DiskFileNum]struct{}) + // Link or copy the sstables. 
+ for l := range current.Levels { + iter := current.Levels[l].Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if excludeFromCheckpoint(f, opt, d.cmp) { + if excludedFiles == nil { + excludedFiles = make(map[deletedFileEntry]*fileMetadata) + } + excludedFiles[deletedFileEntry{ + Level: l, + FileNum: f.FileNum, + }] = f + continue + } + + fileBacking := f.FileBacking + if f.Virtual { + if _, ok := requiredVirtualBackingFiles[fileBacking.DiskFileNum]; ok { + continue + } + requiredVirtualBackingFiles[fileBacking.DiskFileNum] = struct{}{} + } + + srcPath := base.MakeFilepath(fs, d.dirname, fileTypeTable, fileBacking.DiskFileNum) + destPath := fs.PathJoin(destDir, fs.PathBase(srcPath)) + ckErr = vfs.LinkOrCopy(fs, srcPath, destPath) + if ckErr != nil { + return ckErr + } + } + } + + var removeBackingTables []base.DiskFileNum + for diskFileNum := range virtualBackingFiles { + if _, ok := requiredVirtualBackingFiles[diskFileNum]; !ok { + // The backing sstable associated with fileNum is no longer + // required. + removeBackingTables = append(removeBackingTables, diskFileNum) + } + } + + ckErr = d.writeCheckpointManifest( + fs, formatVers, destDir, dir, manifestFileNum, manifestSize, + excludedFiles, removeBackingTables, + ) + if ckErr != nil { + return ckErr + } + + // Copy the WAL files. We copy rather than link because WAL file recycling + // will cause the WAL files to be reused which would invalidate the + // checkpoint. + for i := range memQueue { + logNum := memQueue[i].logNum + if logNum == 0 { + continue + } + srcPath := base.MakeFilepath(fs, d.walDirname, fileTypeLog, logNum) + destPath := fs.PathJoin(destDir, fs.PathBase(srcPath)) + ckErr = vfs.Copy(fs, srcPath, destPath) + if ckErr != nil { + return ckErr + } + } + + // Sync and close the checkpoint directory. 
+ ckErr = dir.Sync() + if ckErr != nil { + return ckErr + } + ckErr = dir.Close() + dir = nil + return ckErr +} + +func (d *DB) writeCheckpointManifest( + fs vfs.FS, + formatVers FormatMajorVersion, + destDirPath string, + destDir vfs.File, + manifestFileNum base.DiskFileNum, + manifestSize int64, + excludedFiles map[deletedFileEntry]*fileMetadata, + removeBackingTables []base.DiskFileNum, +) error { + // Copy the MANIFEST, and create a pointer to it. We copy rather + // than link because additional version edits added to the + // MANIFEST after we took our snapshot of the sstables will + // reference sstables that aren't in our checkpoint. For a + // similar reason, we need to limit how much of the MANIFEST we + // copy. + // If some files are excluded from the checkpoint, also append a block that + // records those files as deleted. + if err := func() error { + srcPath := base.MakeFilepath(fs, d.dirname, fileTypeManifest, manifestFileNum) + destPath := fs.PathJoin(destDirPath, fs.PathBase(srcPath)) + src, err := fs.Open(srcPath, vfs.SequentialReadsOption) + if err != nil { + return err + } + defer src.Close() + + dst, err := fs.Create(destPath) + if err != nil { + return err + } + defer dst.Close() + + // Copy all existing records. We need to copy at the record level in case we + // need to append another record with the excluded files (we cannot simply + // append a record after a raw data copy; see + // https://github.com/cockroachdb/cockroach/issues/100935). + r := record.NewReader(&io.LimitedReader{R: src, N: manifestSize}, manifestFileNum) + w := record.NewWriter(dst) + for { + rr, err := r.Next() + if err != nil { + if err == io.EOF { + break + } + return err + } + + rw, err := w.Next() + if err != nil { + return err + } + if _, err := io.Copy(rw, rr); err != nil { + return err + } + } + + if len(excludedFiles) > 0 { + // Write out an additional VersionEdit that deletes the excluded SST files. 
+ ve := versionEdit{ + DeletedFiles: excludedFiles, + RemovedBackingTables: removeBackingTables, + } + + rw, err := w.Next() + if err != nil { + return err + } + if err := ve.Encode(rw); err != nil { + return err + } + } + if err := w.Close(); err != nil { + return err + } + return dst.Sync() + }(); err != nil { + return err + } + + // Recent format versions use an atomic marker for setting the + // active manifest. Older versions use the CURRENT file. The + // setCurrentFunc function will return a closure that will + // take the appropriate action for the database's format + // version. + var manifestMarker *atomicfs.Marker + manifestMarker, _, err := atomicfs.LocateMarker(fs, destDirPath, manifestMarkerName) + if err != nil { + return err + } + if err := setCurrentFunc(formatVers, manifestMarker, fs, destDirPath, destDir)(manifestFileNum); err != nil { + return err + } + return manifestMarker.Close() +} diff --git a/pebble/checkpoint_test.go b/pebble/checkpoint_test.go new file mode 100644 index 0000000..e5e20a9 --- /dev/null +++ b/pebble/checkpoint_test.go @@ -0,0 +1,415 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "bytes" + "context" + "fmt" + "math/rand" + "slices" + "sort" + "strings" + "sync" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +func TestCheckpoint(t *testing.T) { + dbs := make(map[string]*DB) + defer func() { + for _, db := range dbs { + if db.closed.Load() == nil { + require.NoError(t, db.Close()) + } + } + }() + + mem := vfs.NewMem() + var memLog base.InMemLogger + opts := &Options{ + FS: vfs.WithLogging(mem, memLog.Infof), + FormatMajorVersion: internalFormatNewest, + L0CompactionThreshold: 10, + DisableAutomaticCompactions: true, + } + opts.private.disableTableStats = true + opts.private.testingAlwaysWaitForCleanup = true + + datadriven.RunTest(t, "testdata/checkpoint", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "batch": + if len(td.CmdArgs) != 1 { + return "batch " + } + memLog.Reset() + d := dbs[td.CmdArgs[0].String()] + b := d.NewBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(Sync); err != nil { + return err.Error() + } + return memLog.String() + + case "checkpoint": + if !(len(td.CmdArgs) == 2 || (len(td.CmdArgs) == 3 && td.CmdArgs[2].Key == "restrict")) { + return "checkpoint [restrict=(start-end, ...)]" + } + var opts []CheckpointOption + if len(td.CmdArgs) == 3 { + var spans []CheckpointSpan + for _, v := range td.CmdArgs[2].Vals { + splits := strings.SplitN(v, "-", 2) + if len(splits) != 2 { + return fmt.Sprintf("invalid restrict range %q", v) + } + spans = append(spans, CheckpointSpan{ + Start: []byte(splits[0]), + End: []byte(splits[1]), + }) + } + opts = append(opts, WithRestrictToSpans(spans)) + } + memLog.Reset() + d := dbs[td.CmdArgs[0].String()] + if err := d.Checkpoint(td.CmdArgs[1].String(), opts...); err != nil { + return err.Error() + } + return memLog.String() + + case 
"ingest-and-excise": + d := dbs[td.CmdArgs[0].String()] + + // Hacky but the command doesn't expect a db string. Get rid of it. + td.CmdArgs = td.CmdArgs[1:] + if err := runIngestAndExciseCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + + case "build": + d := dbs[td.CmdArgs[0].String()] + + // Hacky but the command doesn't expect a db string. Get rid of it. + td.CmdArgs = td.CmdArgs[1:] + if err := runBuildCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + + case "lsm": + d := dbs[td.CmdArgs[0].String()] + + // Hacky but the command doesn't expect a db string. Get rid of it. + td.CmdArgs = td.CmdArgs[1:] + return runLSMCmd(td, d) + + case "compact": + if len(td.CmdArgs) != 1 { + return "compact " + } + memLog.Reset() + d := dbs[td.CmdArgs[0].String()] + if err := d.Compact(nil, []byte("\xff"), false); err != nil { + return err.Error() + } + d.TestOnlyWaitForCleaning() + return memLog.String() + + case "print-backing": + // prints contents of the file backing map in the version. Used to + // test whether the checkpoint removed the filebackings correctly. 
+ if len(td.CmdArgs) != 1 { + return "print-backing " + } + d := dbs[td.CmdArgs[0].String()] + d.mu.Lock() + d.mu.versions.logLock() + var fileNums []base.DiskFileNum + for _, b := range d.mu.versions.backingState.fileBackingMap { + fileNums = append(fileNums, b.DiskFileNum) + } + d.mu.versions.logUnlock() + d.mu.Unlock() + + slices.Sort(fileNums) + var buf bytes.Buffer + for _, f := range fileNums { + buf.WriteString(fmt.Sprintf("%s\n", f.String())) + } + return buf.String() + + case "close": + if len(td.CmdArgs) != 1 { + return "close " + } + d := dbs[td.CmdArgs[0].String()] + require.NoError(t, d.Close()) + return "" + + case "flush": + if len(td.CmdArgs) != 1 { + return "flush " + } + memLog.Reset() + d := dbs[td.CmdArgs[0].String()] + if err := d.Flush(); err != nil { + return err.Error() + } + return memLog.String() + + case "list": + if len(td.CmdArgs) != 1 { + return "list " + } + paths, err := mem.List(td.CmdArgs[0].String()) + if err != nil { + return err.Error() + } + sort.Strings(paths) + return fmt.Sprintf("%s\n", strings.Join(paths, "\n")) + + case "open": + if len(td.CmdArgs) != 1 && len(td.CmdArgs) != 2 { + return "open [readonly]" + } + opts.ReadOnly = false + if len(td.CmdArgs) == 2 { + if td.CmdArgs[1].String() != "readonly" { + return "open [readonly]" + } + opts.ReadOnly = true + } + + memLog.Reset() + dir := td.CmdArgs[0].String() + d, err := Open(dir, opts) + if err != nil { + return err.Error() + } + dbs[dir] = d + return memLog.String() + + case "scan": + if len(td.CmdArgs) != 1 { + return "scan " + } + memLog.Reset() + d := dbs[td.CmdArgs[0].String()] + iter, _ := d.NewIter(nil) + for valid := iter.First(); valid; valid = iter.Next() { + memLog.Infof("%s %s", iter.Key(), iter.Value()) + } + memLog.Infof(".") + if err := iter.Close(); err != nil { + memLog.Infof("%v\n", err) + } + return memLog.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestCheckpointCompaction(t *testing.T) { + fs := 
vfs.NewMem() + d, err := Open("", &Options{FS: fs}) + require.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + + var wg sync.WaitGroup + wg.Add(4) + go func() { + defer cancel() + defer wg.Done() + for i := 0; ctx.Err() == nil; i++ { + if err := d.Set([]byte(fmt.Sprintf("key%06d", i)), nil, nil); err != nil { + t.Error(err) + return + } + } + }() + go func() { + defer cancel() + defer wg.Done() + for ctx.Err() == nil { + if err := d.Compact([]byte("key"), []byte("key999999"), false); err != nil { + t.Error(err) + return + } + } + }() + check := make(chan string, 100) + go func() { + defer cancel() + defer close(check) + defer wg.Done() + for i := 0; ctx.Err() == nil && i < 200; i++ { + dir := fmt.Sprintf("checkpoint%06d", i) + if err := d.Checkpoint(dir); err != nil { + t.Error(err) + return + } + select { + case <-ctx.Done(): + return + case check <- dir: + } + } + }() + go func() { + opts := &Options{FS: fs} + defer cancel() + defer wg.Done() + for dir := range check { + d2, err := Open(dir, opts) + if err != nil { + t.Error(err) + return + } + // Check the checkpoint has all the sstables that the manifest + // claims it has. + tableInfos, _ := d2.SSTables() + for _, tables := range tableInfos { + for _, tbl := range tables { + if tbl.Virtual { + continue + } + if _, err := fs.Stat(base.MakeFilepath(fs, dir, base.FileTypeTable, tbl.FileNum.DiskFileNum())); err != nil { + t.Error(err) + return + } + } + } + if err := d2.Close(); err != nil { + t.Error(err) + return + } + } + }() + <-ctx.Done() + wg.Wait() + require.NoError(t, d.Close()) +} + +func TestCheckpointFlushWAL(t *testing.T) { + const checkpointPath = "checkpoints/checkpoint" + fs := vfs.NewStrictMem() + opts := &Options{FS: fs} + key, value := []byte("key"), []byte("value") + + // Create a checkpoint from an unsynced DB. 
+ { + d, err := Open("", opts) + require.NoError(t, err) + { + wb := d.NewBatch() + err = wb.Set(key, value, nil) + require.NoError(t, err) + err = d.Apply(wb, NoSync) + require.NoError(t, err) + } + err = d.Checkpoint(checkpointPath, WithFlushedWAL()) + require.NoError(t, err) + require.NoError(t, d.Close()) + fs.ResetToSyncedState() + } + + // Check that the WAL has been flushed in the checkpoint. + { + files, err := fs.List(checkpointPath) + require.NoError(t, err) + hasLogFile := false + for _, f := range files { + info, err := fs.Stat(fs.PathJoin(checkpointPath, f)) + require.NoError(t, err) + if strings.HasSuffix(f, ".log") { + hasLogFile = true + require.NotZero(t, info.Size()) + } + } + require.True(t, hasLogFile) + } + + // Check that the checkpoint contains the expected data. + { + d, err := Open(checkpointPath, opts) + require.NoError(t, err) + iter, _ := d.NewIter(nil) + require.True(t, iter.First()) + require.Equal(t, key, iter.Key()) + require.Equal(t, value, iter.Value()) + require.False(t, iter.Next()) + require.NoError(t, iter.Close()) + require.NoError(t, d.Close()) + } +} + +func TestCheckpointManyFiles(t *testing.T) { + if testing.Short() { + t.Skip("skipping because of short flag") + } + const checkpointPath = "checkpoint" + opts := &Options{ + FS: vfs.NewMem(), + FormatMajorVersion: internalFormatNewest, + DisableAutomaticCompactions: true, + } + // Disable compression to speed up the test. + opts.EnsureDefaults() + for i := range opts.Levels { + opts.Levels[i].Compression = NoCompression + } + + d, err := Open("", opts) + require.NoError(t, err) + defer d.Close() + + mkKey := func(x int) []byte { + return []byte(fmt.Sprintf("key%06d", x)) + } + // We want to test the case where the appended record with the excluded files + // makes the manifest cross 32KB. This will happen for a range of values + // around 450. 
+ n := 400 + rand.Intn(100) + for i := 0; i < n; i++ { + err := d.Set(mkKey(i), nil, nil) + require.NoError(t, err) + err = d.Flush() + require.NoError(t, err) + } + err = d.Checkpoint(checkpointPath, WithRestrictToSpans([]CheckpointSpan{ + { + Start: mkKey(0), + End: mkKey(10), + }, + })) + require.NoError(t, err) + + // Open the checkpoint and iterate through all the keys. + { + d, err := Open(checkpointPath, opts) + require.NoError(t, err) + iter, _ := d.NewIter(nil) + require.True(t, iter.First()) + require.NoError(t, iter.Error()) + n := 1 + for iter.Next() { + n++ + } + require.NoError(t, iter.Error()) + require.NoError(t, iter.Close()) + require.NoError(t, d.Close()) + require.Equal(t, 10, n) + } +} diff --git a/pebble/cleaner.go b/pebble/cleaner.go new file mode 100644 index 0000000..f9fa43b --- /dev/null +++ b/pebble/cleaner.go @@ -0,0 +1,295 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "runtime/pprof" + "sync" + "time" + + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/objstorage" + "github.com/cockroachdb/tokenbucket" +) + +// Cleaner exports the base.Cleaner type. +type Cleaner = base.Cleaner + +// DeleteCleaner exports the base.DeleteCleaner type. +type DeleteCleaner = base.DeleteCleaner + +// ArchiveCleaner exports the base.ArchiveCleaner type. +type ArchiveCleaner = base.ArchiveCleaner + +type cleanupManager struct { + opts *Options + objProvider objstorage.Provider + onTableDeleteFn func(fileSize uint64) + deletePacer *deletionPacer + + // jobsCh is used as the cleanup job queue. + jobsCh chan *cleanupJob + // waitGroup is used to wait for the background goroutine to exit. 
+ waitGroup sync.WaitGroup + + mu struct { + sync.Mutex + // totalJobs is the total number of enqueued jobs (completed or in progress). + totalJobs int + completedJobs int + completedJobsCond sync.Cond + jobsQueueWarningIssued bool + } +} + +// We can queue this many jobs before we have to block EnqueueJob. +const jobsQueueDepth = 1000 + +// obsoleteFile holds information about a file that needs to be deleted soon. +type obsoleteFile struct { + dir string + fileNum base.DiskFileNum + fileType fileType + fileSize uint64 +} + +type cleanupJob struct { + jobID int + obsoleteFiles []obsoleteFile +} + +// openCleanupManager creates a cleanupManager and starts its background goroutine. +// The cleanupManager must be Close()d. +func openCleanupManager( + opts *Options, + objProvider objstorage.Provider, + onTableDeleteFn func(fileSize uint64), + getDeletePacerInfo func() deletionPacerInfo, +) *cleanupManager { + cm := &cleanupManager{ + opts: opts, + objProvider: objProvider, + onTableDeleteFn: onTableDeleteFn, + deletePacer: newDeletionPacer(time.Now(), int64(opts.TargetByteDeletionRate), getDeletePacerInfo), + jobsCh: make(chan *cleanupJob, jobsQueueDepth), + } + cm.mu.completedJobsCond.L = &cm.mu.Mutex + cm.waitGroup.Add(1) + + go func() { + pprof.Do(context.Background(), gcLabels, func(context.Context) { + cm.mainLoop() + }) + }() + + return cm +} + +// Close stops the background goroutine, waiting until all queued jobs are completed. +// Delete pacing is disabled for the remaining jobs. +func (cm *cleanupManager) Close() { + close(cm.jobsCh) + cm.waitGroup.Wait() +} + +// EnqueueJob adds a cleanup job to the manager's queue. +func (cm *cleanupManager) EnqueueJob(jobID int, obsoleteFiles []obsoleteFile) { + job := &cleanupJob{ + jobID: jobID, + obsoleteFiles: obsoleteFiles, + } + + // Report deleted bytes to the pacer, which can use this data to potentially + // increase the deletion rate to keep up. 
We want to do this at enqueue time + // rather than when we get to the job, otherwise the reported bytes will be + // subject to the throttling rate which defeats the purpose. + var pacingBytes uint64 + for _, of := range obsoleteFiles { + if cm.needsPacing(of.fileType, of.fileNum) { + pacingBytes += of.fileSize + } + } + if pacingBytes > 0 { + cm.deletePacer.ReportDeletion(time.Now(), pacingBytes) + } + + cm.mu.Lock() + cm.mu.totalJobs++ + cm.maybeLogLocked() + cm.mu.Unlock() + + if invariants.Enabled && len(cm.jobsCh) >= cap(cm.jobsCh)-2 { + panic("cleanup jobs queue full") + } + + cm.jobsCh <- job +} + +// Wait until the completion of all jobs that were already queued. +// +// Does not wait for jobs that are enqueued during the call. +// +// Note that DB.mu should not be held while calling this method; the background +// goroutine needs to acquire DB.mu to update deleted table metrics. +func (cm *cleanupManager) Wait() { + cm.mu.Lock() + defer cm.mu.Unlock() + n := cm.mu.totalJobs + for cm.mu.completedJobs < n { + cm.mu.completedJobsCond.Wait() + } +} + +// mainLoop runs the manager's background goroutine. +func (cm *cleanupManager) mainLoop() { + defer cm.waitGroup.Done() + + var tb tokenbucket.TokenBucket + // Use a token bucket with 1 token / second refill rate and 1 token burst. 
+ tb.Init(1.0, 1.0) + for job := range cm.jobsCh { + for _, of := range job.obsoleteFiles { + if of.fileType != fileTypeTable { + path := base.MakeFilepath(cm.opts.FS, of.dir, of.fileType, of.fileNum) + cm.deleteObsoleteFile(of.fileType, job.jobID, path, of.fileNum, of.fileSize) + } else { + cm.maybePace(&tb, of.fileType, of.fileNum, of.fileSize) + cm.onTableDeleteFn(of.fileSize) + cm.deleteObsoleteObject(fileTypeTable, job.jobID, of.fileNum) + } + } + cm.mu.Lock() + cm.mu.completedJobs++ + cm.mu.completedJobsCond.Broadcast() + cm.maybeLogLocked() + cm.mu.Unlock() + } +} + +func (cm *cleanupManager) needsPacing(fileType base.FileType, fileNum base.DiskFileNum) bool { + if fileType != fileTypeTable { + return false + } + meta, err := cm.objProvider.Lookup(fileType, fileNum) + if err != nil { + // The object was already removed from the provider; we won't actually + // delete anything, so we don't need to pace. + return false + } + // Don't throttle deletion of remote objects. + return !meta.IsRemote() +} + +// maybePace sleeps before deleting an object if appropriate. It is always +// called from the background goroutine. +func (cm *cleanupManager) maybePace( + tb *tokenbucket.TokenBucket, fileType base.FileType, fileNum base.DiskFileNum, fileSize uint64, +) { + if !cm.needsPacing(fileType, fileNum) { + return + } + + tokens := cm.deletePacer.PacingDelay(time.Now(), fileSize) + if tokens == 0.0 { + // The token bucket might be in debt; it could make us wait even for 0 + // tokens. We don't want that if the pacer decided throttling should be + // disabled. + return + } + // Wait for tokens. We use a token bucket instead of sleeping outright because + // the token bucket accumulates up to one second of unused tokens. + for { + ok, d := tb.TryToFulfill(tokenbucket.Tokens(tokens)) + if ok { + break + } + time.Sleep(d) + } +} + +// deleteObsoleteFile deletes a (non-object) file that is no longer needed. 
+func (cm *cleanupManager) deleteObsoleteFile( + fileType fileType, jobID int, path string, fileNum base.DiskFileNum, fileSize uint64, +) { + // TODO(peter): need to handle this error, probably by re-adding the + // file that couldn't be deleted to one of the obsolete slices map. + err := cm.opts.Cleaner.Clean(cm.opts.FS, fileType, path) + if oserror.IsNotExist(err) { + return + } + + switch fileType { + case fileTypeLog: + cm.opts.EventListener.WALDeleted(WALDeleteInfo{ + JobID: jobID, + Path: path, + FileNum: fileNum.FileNum(), + Err: err, + }) + case fileTypeManifest: + cm.opts.EventListener.ManifestDeleted(ManifestDeleteInfo{ + JobID: jobID, + Path: path, + FileNum: fileNum.FileNum(), + Err: err, + }) + case fileTypeTable: + panic("invalid deletion of object file") + } +} + +func (cm *cleanupManager) deleteObsoleteObject( + fileType fileType, jobID int, fileNum base.DiskFileNum, +) { + if fileType != fileTypeTable { + panic("not an object") + } + + var path string + meta, err := cm.objProvider.Lookup(fileType, fileNum) + if err != nil { + path = "" + } else { + path = cm.objProvider.Path(meta) + err = cm.objProvider.Remove(fileType, fileNum) + } + if cm.objProvider.IsNotExistError(err) { + return + } + + switch fileType { + case fileTypeTable: + cm.opts.EventListener.TableDeleted(TableDeleteInfo{ + JobID: jobID, + Path: path, + FileNum: fileNum.FileNum(), + Err: err, + }) + } +} + +// maybeLogLocked issues a log if the job queue gets 75% full and issues a log +// when the job queue gets back to less than 10% full. +// +// Must be called with cm.mu locked. 
+func (cm *cleanupManager) maybeLogLocked() { + const highThreshold = jobsQueueDepth * 3 / 4 + const lowThreshold = jobsQueueDepth / 10 + + jobsInQueue := cm.mu.totalJobs - cm.mu.completedJobs + + if !cm.mu.jobsQueueWarningIssued && jobsInQueue > highThreshold { + cm.mu.jobsQueueWarningIssued = true + cm.opts.Logger.Infof("cleanup falling behind; job queue has over %d jobs", highThreshold) + } + + if cm.mu.jobsQueueWarningIssued && jobsInQueue < lowThreshold { + cm.mu.jobsQueueWarningIssued = false + cm.opts.Logger.Infof("cleanup back to normal; job queue has under %d jobs", lowThreshold) + } +} diff --git a/pebble/cleaner_test.go b/pebble/cleaner_test.go new file mode 100644 index 0000000..11d9ab9 --- /dev/null +++ b/pebble/cleaner_test.go @@ -0,0 +1,137 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "fmt" + "sort" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +func TestCleaner(t *testing.T) { + dbs := make(map[string]*DB) + defer func() { + for _, db := range dbs { + require.NoError(t, db.Close()) + } + }() + + mem := vfs.NewMem() + var memLog base.InMemLogger + fs := vfs.WithLogging(mem, memLog.Infof) + datadriven.RunTest(t, "testdata/cleaner", func(t *testing.T, td *datadriven.TestData) string { + memLog.Reset() + switch td.Cmd { + case "batch": + if len(td.CmdArgs) != 1 { + return "batch " + } + d := dbs[td.CmdArgs[0].String()] + b := d.NewBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(Sync); err != nil { + return err.Error() + } + return memLog.String() + + case "compact": + if len(td.CmdArgs) != 1 { + return "compact " + } + d := dbs[td.CmdArgs[0].String()] + if err := d.Compact(nil, 
[]byte("\xff"), false); err != nil { + return err.Error() + } + return memLog.String() + + case "flush": + if len(td.CmdArgs) != 1 { + return "flush " + } + d := dbs[td.CmdArgs[0].String()] + if err := d.Flush(); err != nil { + return err.Error() + } + return memLog.String() + + case "close": + if len(td.CmdArgs) != 1 { + return "close " + } + dbDir := td.CmdArgs[0].String() + d := dbs[dbDir] + if err := d.Close(); err != nil { + return err.Error() + } + delete(dbs, dbDir) + return memLog.String() + + case "list": + if len(td.CmdArgs) != 1 { + return "list " + } + paths, err := mem.List(td.CmdArgs[0].String()) + if err != nil { + return err.Error() + } + sort.Strings(paths) + return fmt.Sprintf("%s\n", strings.Join(paths, "\n")) + + case "open": + if len(td.CmdArgs) < 1 || len(td.CmdArgs) > 3 { + return "open [archive] [readonly]" + } + dir := td.CmdArgs[0].String() + opts := (&Options{ + FS: fs, + WALDir: dir + "_wal", + }).WithFSDefaults() + + for i := 1; i < len(td.CmdArgs); i++ { + switch td.CmdArgs[i].String() { + case "readonly": + opts.ReadOnly = true + case "archive": + opts.Cleaner = ArchiveCleaner{} + default: + return "open [archive] [readonly]" + } + } + // Asynchronous table stats retrieval makes the output flaky. 
+ opts.private.disableTableStats = true + opts.private.testingAlwaysWaitForCleanup = true + d, err := Open(dir, opts) + if err != nil { + return err.Error() + } + d.TestOnlyWaitForCleaning() + dbs[dir] = d + return memLog.String() + + case "create-bogus-file": + if len(td.CmdArgs) != 1 { + return "create-bogus-file " + } + dst, err := fs.Create(td.CmdArgs[0].String()) + require.NoError(t, err) + _, err = dst.Write([]byte("bogus data")) + require.NoError(t, err) + require.NoError(t, dst.Sync()) + require.NoError(t, dst.Close()) + return memLog.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} diff --git a/pebble/cmd/pebble/.gitignore b/pebble/cmd/pebble/.gitignore new file mode 100644 index 0000000..812a2be --- /dev/null +++ b/pebble/cmd/pebble/.gitignore @@ -0,0 +1 @@ +pebble diff --git a/pebble/cmd/pebble/db.go b/pebble/cmd/pebble/db.go new file mode 100644 index 0000000..41c6e59 --- /dev/null +++ b/pebble/cmd/pebble/db.go @@ -0,0 +1,168 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "log" + + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/bloom" + "github.com/cockroachdb/pebble/internal/bytealloc" + "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/vfs" +) + +// DB specifies the minimal interfaces that need to be implemented to support +// the pebble command. 
+type DB interface { + NewIter(*pebble.IterOptions) iterator + NewBatch() batch + Scan(iter iterator, key []byte, count int64, reverse bool) error + Metrics() *pebble.Metrics + Flush() error +} + +type iterator interface { + SeekLT(key []byte) bool + SeekGE(key []byte) bool + Valid() bool + Key() []byte + Value() []byte + First() bool + Next() bool + Last() bool + Prev() bool + Close() error +} + +type batch interface { + Close() error + Commit(opts *pebble.WriteOptions) error + Set(key, value []byte, opts *pebble.WriteOptions) error + Delete(key []byte, opts *pebble.WriteOptions) error + LogData(data []byte, opts *pebble.WriteOptions) error +} + +// Adapters for Pebble. Since the interfaces above are based on Pebble's +// interfaces, it can simply forward calls for everything. +type pebbleDB struct { + d *pebble.DB + ballast []byte +} + +func newPebbleDB(dir string) DB { + cache := pebble.NewCache(cacheSize) + defer cache.Unref() + opts := &pebble.Options{ + Cache: cache, + Comparer: mvccComparer, + DisableWAL: disableWAL, + FormatMajorVersion: pebble.FormatNewest, + L0CompactionThreshold: 2, + L0StopWritesThreshold: 1000, + LBaseMaxBytes: 64 << 20, // 64 MB + Levels: make([]pebble.LevelOptions, 7), + MaxOpenFiles: 16384, + MemTableSize: 64 << 20, + MemTableStopWritesThreshold: 4, + Merger: &pebble.Merger{ + Name: "cockroach_merge_operator", + }, + MaxConcurrentCompactions: func() int { + return 3 + }, + } + + for i := 0; i < len(opts.Levels); i++ { + l := &opts.Levels[i] + l.BlockSize = 32 << 10 // 32 KB + l.IndexBlockSize = 256 << 10 // 256 KB + l.FilterPolicy = bloom.FilterPolicy(10) + l.FilterType = pebble.TableFilter + if i > 0 { + l.TargetFileSize = opts.Levels[i-1].TargetFileSize * 2 + } + l.EnsureDefaults() + } + opts.Levels[6].FilterPolicy = nil + opts.FlushSplitBytes = opts.Levels[0].TargetFileSize + + opts.EnsureDefaults() + + if verbose { + lel := pebble.MakeLoggingEventListener(nil) + opts.EventListener = &lel + opts.EventListener.TableDeleted = nil + 
opts.EventListener.TableIngested = nil + opts.EventListener.WALCreated = nil + opts.EventListener.WALDeleted = nil + } + + if pathToLocalSharedStorage != "" { + opts.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ + // Store all shared objects on local disk, for convenience. + "": remote.NewLocalFS(pathToLocalSharedStorage, vfs.Default), + }) + opts.Experimental.CreateOnShared = remote.CreateOnSharedAll + if secondaryCacheSize != 0 { + opts.Experimental.SecondaryCacheSizeBytes = secondaryCacheSize + } + } + + p, err := pebble.Open(dir, opts) + if err != nil { + log.Fatal(err) + } + if pathToLocalSharedStorage != "" { + if err := p.SetCreatorID(1); err != nil { + log.Fatal(err) + } + } + return pebbleDB{ + d: p, + ballast: make([]byte, 1<<30), + } +} + +func (p pebbleDB) Flush() error { + return p.d.Flush() +} + +func (p pebbleDB) NewIter(opts *pebble.IterOptions) iterator { + iter, _ := p.d.NewIter(opts) + return iter +} + +func (p pebbleDB) NewBatch() batch { + return p.d.NewBatch() +} + +func (p pebbleDB) Scan(iter iterator, key []byte, count int64, reverse bool) error { + var data bytealloc.A + if reverse { + for i, valid := 0, iter.SeekLT(key); valid; valid = iter.Prev() { + data, _ = data.Copy(iter.Key()) + data, _ = data.Copy(iter.Value()) + i++ + if i >= int(count) { + break + } + } + } else { + for i, valid := 0, iter.SeekGE(key); valid; valid = iter.Next() { + data, _ = data.Copy(iter.Key()) + data, _ = data.Copy(iter.Value()) + i++ + if i >= int(count) { + break + } + } + } + return nil +} + +func (p pebbleDB) Metrics() *pebble.Metrics { + return p.d.Metrics() +} diff --git a/pebble/cmd/pebble/fsbench.go b/pebble/cmd/pebble/fsbench.go new file mode 100644 index 0000000..94d437d --- /dev/null +++ b/pebble/cmd/pebble/fsbench.go @@ -0,0 +1,707 @@ +package main + +import ( + "bytes" + "fmt" + "log" + "os" + "path" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/errors" + 
"github.com/cockroachdb/pebble/vfs" + "github.com/spf13/cobra" +) + +var fsBenchCmd = &cobra.Command{ + Use: "fs ", + Short: "Run file system benchmarks.", + Long: ` +Run file system benchmarks. Each benchmark is predefined and can be +run using the command "bench fs --bench-name ". +Each possible which can be run is defined in the code. +Benchmarks may require the specification of a --duration or +--max-ops flag, to prevent the benchmark from running forever +or running out of memory. + +The --num-times flag can be used to run the entire benchmark, more than +once. If the flag isn't provided, then the benchmark is only run once. +`, + Args: cobra.ExactArgs(1), + RunE: runFsBench, +} + +const writeBatchSize = 1 << 10 + +var fsConfig struct { + // An upper limit on the number of ops which can be run. + maxOps int + + // Benchmark to run. + benchname string + + // Number of times each benchmark should be run. + numTimes int + + fs vfs.FS + + precomputedWriteBatch []byte +} + +func init() { + fsBenchCmd.Flags().IntVar( + &fsConfig.maxOps, "max-ops", 0, + "Maximum number of times the operation which is being benchmarked should be run.", + ) + + fsBenchCmd.Flags().StringVar( + &fsConfig.benchname, "bench-name", "", "The benchmark to run.") + fsBenchCmd.MarkFlagRequired("bench-name") + + fsBenchCmd.Flags().IntVar( + &fsConfig.numTimes, "num-times", 1, + "Number of times each benchmark should be run.") + + // Add subcommand to list + fsBenchCmd.AddCommand(listFsBench) + + // Just use the default vfs implementation for now. + fsConfig.fs = vfs.Default + + fsConfig.precomputedWriteBatch = bytes.Repeat([]byte("a"), writeBatchSize) +} + +// State relevant to a benchmark. +type fsBench struct { + // A short name for the benchmark. + name string + + // A one line description for the benchmark. + description string + + // numOps is the total number of ops which + // have been run for the benchmark. 
This is used + // to make sure that we don't benchmark the operation + // more than max-ops times. + numOps int + + // directory under which the benchmark is run. + dir vfs.File + dirName string + + // Stats associated with the benchmark. + reg *histogramRegistry + + // The operation which we're benchmarking. This + // will be called over and over again. + // Returns false if run should no longer be called. + run func(*namedHistogram) bool + + // Stop the benchmark from executing any further. + // Stop is safe to call concurrently with run. + stop func() + + // A cleanup func which must be called after + // the benchmark has finished running. + // Clean should be only called after making sure + // that the run function is no longer executing. + clean func() +} + +// createFile can be used to create an empty file. +// Invariant: File shouldn't already exist. +func createFile(filepath string) vfs.File { + fh, err := fsConfig.fs.Create(filepath) + if err != nil { + log.Fatalln(err) + } + return fh +} + +// Invariant: file with filepath should exist. +func deleteFile(filepath string) { + err := fsConfig.fs.Remove(filepath) + if err != nil { + log.Fatalln(err) + } +} + +// Write size bytes to the file in batches. 
+func writeToFile(fh vfs.File, size int64) { + for size > 0 { + var toWrite []byte + if size >= writeBatchSize { + toWrite = fsConfig.precomputedWriteBatch + } else { + toWrite = fsConfig.precomputedWriteBatch[:size] + } + written, err := fh.Write(toWrite) + if err != nil { + log.Fatalln(err) + } + if written != len(toWrite) { + log.Fatalf("Couldn't write %d bytes to file\n", size) + } + size -= int64(len(toWrite)) + } +} + +func syncFile(fh vfs.File) { + err := fh.Sync() + if err != nil { + log.Fatalln(err) + } +} + +func closeFile(fh vfs.File) { + err := fh.Close() + if err != nil { + log.Fatalln(err) + } +} + +func getDiskUsage(filepath string) { + _, err := fsConfig.fs.GetDiskUsage(filepath) + if err != nil { + log.Fatalln(err) + } +} + +func openDir(filepath string) vfs.File { + fh, err := fsConfig.fs.OpenDir(filepath) + if err != nil { + log.Fatalln(err) + } + return fh +} + +func mkDir(filepath string) { + err := fsConfig.fs.MkdirAll(filepath, 0755) + if err != nil { + log.Fatalln(err) + } +} + +func removeAllFiles(filepath string) { + err := fsConfig.fs.RemoveAll(filepath) + if err != nil { + log.Fatalln(err) + } +} + +// fileSize is in bytes. +func createBench(benchName string, benchDescription string) fsBenchmark { + createBench := func(dirpath string) *fsBench { + bench := &fsBench{} + mkDir(dirpath) + fh := openDir(dirpath) + + bench.dir = fh + bench.dirName = dirpath + bench.reg = newHistogramRegistry() + bench.numOps = 0 + bench.name = benchName + bench.description = benchDescription + + // setup the operation to benchmark, and the cleanup functions. 
+ pref := "temp_" + var numFiles int + var done atomic.Bool + + bench.run = func(hist *namedHistogram) bool { + if done.Load() { + return false + } + + start := time.Now() + fh := createFile(path.Join(dirpath, fmt.Sprintf("%s%d", pref, numFiles))) + syncFile(bench.dir) + hist.Record(time.Since(start)) + + closeFile(fh) + numFiles++ + return true + } + + bench.stop = func() { + done.Store(true) + } + + bench.clean = func() { + removeAllFiles(dirpath) + closeFile(bench.dir) + } + + return bench + } + + return fsBenchmark{ + createBench, + benchName, + benchDescription, + } +} + +// This benchmark prepopulates a directory with some files of a given size. Then, it creates and deletes +// a file of some size, while measuring only the performance of the delete. +func deleteBench( + benchName string, benchDescription string, preNumFiles int, preFileSize int64, fileSize int64, +) fsBenchmark { + + createBench := func(dirpath string) *fsBench { + bench := &fsBench{} + mkDir(dirpath) + fh := openDir(dirpath) + + bench.dir = fh + bench.dirName = dirpath + bench.reg = newHistogramRegistry() + bench.numOps = 0 + bench.name = benchName + bench.description = benchDescription + + // prepopulate the directory + prePref := "pre_temp_" + for i := 0; i < preNumFiles; i++ { + fh := createFile(path.Join(dirpath, fmt.Sprintf("%s%d", prePref, i))) + if preFileSize > 0 { + writeToFile(fh, preFileSize) + syncFile(fh) + } + closeFile(fh) + } + syncFile(bench.dir) + + var done atomic.Bool + bench.run = func(hist *namedHistogram) bool { + if done.Load() { + return false + } + + filename := "newfile" + fh := createFile(path.Join(dirpath, filename)) + writeToFile(fh, fileSize) + syncFile(fh) + + start := time.Now() + deleteFile(path.Join(dirpath, filename)) + hist.Record(time.Since(start)) + + return true + } + + bench.stop = func() { + done.Store(true) + } + + bench.clean = func() { + removeAllFiles(dirpath) + closeFile(bench.dir) + } + + return bench + } + + return fsBenchmark{ + createBench, 
+ benchName, + benchDescription, + } +} + +// This benchmark creates some files in a directory, and then measures the performance +// of the vfs.Remove function. +// fileSize is in bytes. +func deleteUniformBench( + benchName string, benchDescription string, numFiles int, fileSize int64, +) fsBenchmark { + createBench := func(dirpath string) *fsBench { + bench := &fsBench{} + mkDir(dirpath) + fh := openDir(dirpath) + + bench.dir = fh + bench.dirName = dirpath + bench.reg = newHistogramRegistry() + bench.numOps = 0 + bench.name = benchName + bench.description = benchDescription + + // setup the operation to benchmark, and the cleaup functions. + pref := "temp_" + for i := 0; i < numFiles; i++ { + fh := createFile(path.Join(dirpath, fmt.Sprintf("%s%d", pref, i))) + if fileSize > 0 { + writeToFile(fh, fileSize) + syncFile(fh) + } + closeFile(fh) + } + syncFile(bench.dir) + + var done atomic.Bool + bench.run = func(hist *namedHistogram) bool { + if done.Load() { + return false + } + + if numFiles == 0 { + return false + } + + start := time.Now() + deleteFile(path.Join(dirpath, fmt.Sprintf("%s%d", pref, numFiles-1))) + hist.Record(time.Since(start)) + + numFiles-- + return true + } + + bench.stop = func() { + done.Store(true) + } + + bench.clean = func() { + removeAll(dirpath) + closeFile(bench.dir) + } + + return bench + } + + return fsBenchmark{ + createBench, + benchName, + benchDescription, + } +} + +// Tests the performance of syncing data to disk. +// Only measures the sync performance. +// The writes will be synced after every writeSize bytes have been written. 
+func writeSyncBench( + benchName string, benchDescription string, maxFileSize int64, writeSize int64, +) fsBenchmark { + + if writeSize > maxFileSize { + log.Fatalln("File write threshold is greater than max file size.") + } + + createBench := func(dirpath string) *fsBench { + bench := &fsBench{} + mkDir(dirpath) + fh := openDir(dirpath) + + bench.dir = fh + bench.dirName = dirpath + bench.reg = newHistogramRegistry() + bench.numOps = 0 + bench.name = benchName + bench.description = benchDescription + + pref := "temp_" + var benchData struct { + done atomic.Bool + fh vfs.File + fileNum int + bytesWritten int64 + } + benchData.fh = createFile(path.Join(dirpath, fmt.Sprintf("%s%d", pref, benchData.fileNum))) + + bench.run = func(hist *namedHistogram) bool { + if benchData.done.Load() { + return false + } + + if benchData.bytesWritten+writeSize > maxFileSize { + closeFile(benchData.fh) + benchData.fileNum++ + benchData.bytesWritten = 0 + benchData.fh = createFile(path.Join(dirpath, fmt.Sprintf("%s%d", pref, benchData.fileNum))) + } + + benchData.bytesWritten += writeSize + writeToFile(benchData.fh, writeSize) + + start := time.Now() + syncFile(benchData.fh) + hist.Record(time.Since(start)) + + return true + } + + bench.stop = func() { + benchData.done.Store(true) + } + + bench.clean = func() { + closeFile(benchData.fh) + removeAllFiles(dirpath) + closeFile(bench.dir) + } + + return bench + } + + return fsBenchmark{ + createBench, + benchName, + benchDescription, + } +} + +// Tests the peformance of calling the vfs.GetDiskUsage call on a directory, +// as the number of files/total size of files in the directory grows. 
+func diskUsageBench( + benchName string, benchDescription string, maxFileSize int64, writeSize int64, +) fsBenchmark { + + if writeSize > maxFileSize { + log.Fatalln("File write threshold is greater than max file size.") + } + + createBench := func(dirpath string) *fsBench { + bench := &fsBench{} + mkDir(dirpath) + fh := openDir(dirpath) + + bench.dir = fh + bench.dirName = dirpath + bench.reg = newHistogramRegistry() + bench.numOps = 0 + bench.name = benchName + bench.description = benchDescription + + pref := "temp_" + var benchData struct { + done atomic.Bool + fh vfs.File + fileNum int + bytesWritten int64 + } + benchData.fh = createFile(path.Join(dirpath, fmt.Sprintf("%s%d", pref, benchData.fileNum))) + + bench.run = func(hist *namedHistogram) bool { + if benchData.done.Load() { + return false + } + + if benchData.bytesWritten+writeSize > maxFileSize { + closeFile(benchData.fh) + benchData.fileNum++ + benchData.bytesWritten = 0 + benchData.fh = createFile(path.Join(dirpath, fmt.Sprintf("%s%d", pref, benchData.fileNum))) + } + + benchData.bytesWritten += writeSize + writeToFile(benchData.fh, writeSize) + syncFile(benchData.fh) + + start := time.Now() + getDiskUsage(dirpath) + hist.Record(time.Since(start)) + + return true + } + + bench.stop = func() { + benchData.done.Store(true) + } + + bench.clean = func() { + closeFile(benchData.fh) + removeAllFiles(dirpath) + closeFile(bench.dir) + } + + return bench + } + + return fsBenchmark{ + createBench, + benchName, + benchDescription, + } +} + +// A benchmark is a function which takes a directory +// as input and returns the fsBench struct which has +// all the information required to run the benchmark. +type fsBenchmark struct { + createBench func(string) *fsBench + name string + description string +} + +// The various benchmarks which can be run. 
+var benchmarks = map[string]fsBenchmark{ + "create_empty": createBench("create_empty", "create empty file, sync par dir"), + "delete_10k_2MiB": deleteUniformBench( + "delete_10k_2MiB", "create 10k 2MiB size files, measure deletion times", 10_000, 2<<20, + ), + "delete_100k_2MiB": deleteUniformBench( + "delete_100k_2MiB", "create 100k 2MiB size files, measure deletion times", 100_000, 2<<20, + ), + "delete_200k_2MiB": deleteUniformBench( + "delete_200k_2MiB", "create 200k 2MiB size files, measure deletion times", 200_000, 2<<20, + ), + "write_sync_1MiB": writeSyncBench( + "write_sync_1MiB", "Write 1MiB to a file, then sync, while timing the sync.", 2<<30, 1<<20, + ), + "write_sync_16MiB": writeSyncBench( + "write_sync_16MiB", "Write 16MiB to a file, then sync, while timing the sync.", 2<<30, 16<<20, + ), + "write_sync_128MiB": writeSyncBench( + "write_sync_128MiB", "Write 128MiB to a file, then sync, while timing the sync.", 2<<30, 128<<20, + ), + "disk_usage_128MB": diskUsageBench( + "disk_usage_128MB", + "Write 128MiB to a file, measure GetDiskUsage call. 
Create a new file, when file size is 1GB.", + 1<<30, 128<<20, + ), + "disk_usage_many_files": diskUsageBench( + "disk_usage_many_files", + "Create new file, Write 128KiB to a file, measure GetDiskUsage call.", + 128<<10, 128<<10, + ), + "delete_large_dir_256MiB": deleteBench( + "delete_large_dir_256MiB", "Prepopulate directory with 100k 1MiB files, measure delete performance of 256MiB files", + 1e5, 1<<20, 256<<20, + ), + "delete_large_dir_2MiB": deleteBench( + "delete_large_dir_2MiB", "Prepopulate directory with 100k 1MiB files, measure delete performance of 2MiB files", + 1e5, 1<<20, 2<<20, + ), + "delete_small_dir_2GiB": deleteBench( + "delete_small_dir_2GiB", "Prepopulate directory with 1k 1MiB files, measure delete performance of 2GiB files", + 1e3, 1<<20, 2<<30, + ), + "delete_small_dir_256MiB": deleteBench( + "delete_small_dir_256MiB", "Prepopulate directory with 1k 1MiB files, measure delete performance of 256MiB files", + 1e3, 1<<20, 256<<20, + ), + "delete_small_dir_2MiB": deleteBench( + "delete_small_dir_2MiB", "Prepopulate directory with 1k 1MiB files, measure delete performance of 2MiB files", + 1e3, 1<<20, 2<<20, + ), +} + +func runFsBench(_ *cobra.Command, args []string) error { + benchmark, ok := benchmarks[fsConfig.benchname] + if !ok { + return errors.Errorf("trying to run an unknown benchmark: %s", fsConfig.benchname) + } + + // Run the benchmark a couple of times. 
+ fmt.Printf("The benchmark will be run %d time(s).\n", fsConfig.numTimes) + for i := 0; i < fsConfig.numTimes; i++ { + fmt.Println("Starting benchmark:", i) + benchStruct := benchmark.createBench(args[0]) + runTestWithoutDB(testWithoutDB{ + init: benchStruct.init, + tick: benchStruct.tick, + done: benchStruct.done, + }) + } + return nil +} + +func (bench *fsBench) init(wg *sync.WaitGroup) { + fmt.Println("Running benchmark:", bench.name) + fmt.Println("Description:", bench.description) + + wg.Add(1) + go bench.execute(wg) +} + +func (bench *fsBench) execute(wg *sync.WaitGroup) { + defer wg.Done() + + latencyHist := bench.reg.Register(bench.name) + + for { + // run the op which we're benchmarking. + bench.numOps++ + + // The running function will determine exactly what latency + // it wants to measure. + continueBench := bench.run(latencyHist) + if !continueBench || (fsConfig.maxOps > 0 && bench.numOps >= fsConfig.maxOps) { + break + } + } +} + +func (bench *fsBench) tick(elapsed time.Duration, i int) { + if i%20 == 0 { + fmt.Println("____optype__elapsed__ops/sec(inst)___ops/sec(cum)__p50(ms)__p95(ms)__p99(ms)__pMax(ms)") + } + bench.reg.Tick(func(tick histogramTick) { + h := tick.Hist + + fmt.Printf("%10s %8s %14.1f %14.1f %5.6f %5.6f %5.6f %5.6f\n", + tick.Name[:10], + time.Duration(elapsed.Seconds()+0.5)*time.Second, + float64(h.TotalCount())/tick.Elapsed.Seconds(), + float64(tick.Cumulative.TotalCount())/elapsed.Seconds(), + time.Duration(h.ValueAtQuantile(50)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(95)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(99)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(100)).Seconds()*1000, + ) + }) +} + +func (bench *fsBench) done(wg *sync.WaitGroup, elapsed time.Duration) { + // Do the cleanup. 
+ bench.stop() + wg.Wait() + defer bench.clean() + + fmt.Println("\n____optype__elapsed_____ops(total)___ops/sec(cum)__avg(ms)__p50(ms)__p95(ms)__p99(ms)__pMax(ms)") + + resultTick := histogramTick{} + bench.reg.Tick(func(tick histogramTick) { + h := tick.Cumulative + if resultTick.Cumulative == nil { + resultTick.Now = tick.Now + resultTick.Cumulative = h + } else { + resultTick.Cumulative.Merge(h) + } + + fmt.Printf("%10s %7.1fs %14d %14.1f %5.6f %5.6f %5.6f %5.6f %5.6f\n", + tick.Name[:10], elapsed.Seconds(), h.TotalCount(), + float64(h.TotalCount())/elapsed.Seconds(), + time.Duration(h.Mean()).Seconds()*1000, + time.Duration(h.ValueAtQuantile(50)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(95)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(99)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(100)).Seconds()*1000, + ) + }) + fmt.Println() + + resultHist := resultTick.Cumulative + + fmt.Printf("Benchmarkfsbench/%s %d %0.1f ops/sec\n\n", + bench.name, + resultHist.TotalCount(), + float64(resultHist.TotalCount())/elapsed.Seconds(), + ) +} + +func verbosef(fmtstr string, args ...interface{}) { + if verbose { + fmt.Printf(fmtstr, args...) + } +} + +func removeAll(dir string) { + verbosef("Removing %q.\n", dir) + if err := os.RemoveAll(dir); err != nil { + log.Fatal(err) + } +} diff --git a/pebble/cmd/pebble/fsbenchlist.go b/pebble/cmd/pebble/fsbenchlist.go new file mode 100644 index 0000000..467af81 --- /dev/null +++ b/pebble/cmd/pebble/fsbenchlist.go @@ -0,0 +1,39 @@ +package main + +import ( + "fmt" + + "github.com/cockroachdb/errors" + "github.com/spf13/cobra" +) + +var listFsBench = &cobra.Command{ + Use: "list [] [] ...", + Short: "List the available file system benchmarks.", + Long: ` +List the available file system benchmarks. If no is supplied +as an argument, then all the available benchmark names are printed. +If one or more s are supplied as arguments, then the benchmark +descriptions are printed out for those names. 
+`, + RunE: runListFsBench, +} + +func runListFsBench(_ *cobra.Command, args []string) error { + if len(args) == 0 { + fmt.Println("Available benchmarks:") + for name := range benchmarks { + fmt.Println(name) + } + } else { + for _, v := range args { + benchStruct, ok := benchmarks[v] + if !ok { + return errors.Errorf("trying to print out the description for unknown benchmark: %s", v) + } + fmt.Println("Name:", benchStruct.name) + fmt.Println("Description:", benchStruct.description) + } + } + return nil +} diff --git a/pebble/cmd/pebble/main.go b/pebble/cmd/pebble/main.go new file mode 100644 index 0000000..9417bfb --- /dev/null +++ b/pebble/cmd/pebble/main.go @@ -0,0 +1,99 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "log" + "os" + "time" + + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/tool" + "github.com/spf13/cobra" +) + +var ( + cacheSize int64 + concurrency int + disableWAL bool + duration time.Duration + maxSize uint64 + maxOpsPerSec = newRateFlag("") + verbose bool + waitCompactions bool + wipe bool + pathToLocalSharedStorage string + // If zero, or if !sharedStorageEnabled, secondary cache is + // not used. + secondaryCacheSize int64 +) + +func main() { + log.SetFlags(0) + + cobra.EnableCommandSorting = false + + benchCmd := &cobra.Command{ + Use: "bench", + Short: "benchmarks", + } + + replayCmd := initReplayCmd() + benchCmd.AddCommand( + replayCmd, + scanCmd, + syncCmd, + tombstoneCmd, + ycsbCmd, + fsBenchCmd, + writeBenchCmd, + ) + + rootCmd := &cobra.Command{ + Use: "pebble [command] (flags)", + Short: "pebble benchmarking/introspection tool", + } + rootCmd.AddCommand(benchCmd) + + t := tool.New(tool.Comparers(mvccComparer, testkeys.Comparer), tool.Mergers(fauxMVCCMerger)) + rootCmd.AddCommand(t.Commands...) 
+ + for _, cmd := range []*cobra.Command{replayCmd, scanCmd, syncCmd, tombstoneCmd, writeBenchCmd, ycsbCmd} { + cmd.Flags().BoolVarP( + &verbose, "verbose", "v", false, "enable verbose event logging") + cmd.Flags().StringVar( + &pathToLocalSharedStorage, "shared-storage", "", "path to local shared storage (empty for no shared storage)") + cmd.Flags().Int64Var( + &secondaryCacheSize, "secondary-cache", 0, "secondary cache size in bytes") + } + for _, cmd := range []*cobra.Command{scanCmd, syncCmd, tombstoneCmd, ycsbCmd} { + cmd.Flags().Int64Var( + &cacheSize, "cache", 1<<30, "cache size") + } + for _, cmd := range []*cobra.Command{scanCmd, syncCmd, tombstoneCmd, ycsbCmd, fsBenchCmd, writeBenchCmd} { + cmd.Flags().DurationVarP( + &duration, "duration", "d", 10*time.Second, "the duration to run (0, run forever)") + } + for _, cmd := range []*cobra.Command{scanCmd, syncCmd, tombstoneCmd, ycsbCmd} { + cmd.Flags().IntVarP( + &concurrency, "concurrency", "c", 1, "number of concurrent workers") + cmd.Flags().BoolVar( + &disableWAL, "disable-wal", false, "disable the WAL (voiding persistence guarantees)") + cmd.Flags().VarP( + maxOpsPerSec, "rate", "m", "max ops per second [{zipf,uniform}:]min[-max][/period (sec)]") + cmd.Flags().BoolVar( + &waitCompactions, "wait-compactions", false, + "wait for background compactions to complete after load stops") + cmd.Flags().BoolVarP( + &wipe, "wipe", "w", false, "wipe the database before starting") + cmd.Flags().Uint64Var( + &maxSize, "max-size", 0, "maximum disk size, in MB (0, run forever)") + } + + if err := rootCmd.Execute(); err != nil { + // Cobra has already printed the error message. + os.Exit(1) + } +} diff --git a/pebble/cmd/pebble/mvcc.go b/pebble/cmd/pebble/mvcc.go new file mode 100644 index 0000000..0e388de --- /dev/null +++ b/pebble/cmd/pebble/mvcc.go @@ -0,0 +1,223 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "bytes" + + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/internal/bytealloc" +) + +// MVCC encoding and decoding routines adapted from CockroachDB sources. Used +// to perform apples-to-apples benchmarking for CockroachDB's usage of RocksDB. + +var mvccComparer = &pebble.Comparer{ + Compare: mvccCompare, + + AbbreviatedKey: func(k []byte) uint64 { + key, _, ok := mvccSplitKey(k) + if !ok { + return 0 + } + return pebble.DefaultComparer.AbbreviatedKey(key) + }, + + Equal: func(a, b []byte) bool { + return mvccCompare(a, b) == 0 + }, + + Separator: func(dst, a, b []byte) []byte { + aKey, _, ok := mvccSplitKey(a) + if !ok { + return append(dst, a...) + } + bKey, _, ok := mvccSplitKey(b) + if !ok { + return append(dst, a...) + } + // If the keys are the same just return a. + if bytes.Equal(aKey, bKey) { + return append(dst, a...) + } + n := len(dst) + // MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as + // pebble.DefaultComparer, so reuse the latter's Separator implementation. + dst = pebble.DefaultComparer.Separator(dst, aKey, bKey) + // Did it pick a separator different than aKey -- if it did not we can't do better than a. + buf := dst[n:] + if bytes.Equal(aKey, buf) { + return append(dst[:n], a...) + } + // The separator is > aKey, so we only need to add the timestamp sentinel. + return append(dst, 0) + }, + + Successor: func(dst, a []byte) []byte { + aKey, _, ok := mvccSplitKey(a) + if !ok { + return append(dst, a...) + } + n := len(dst) + // MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as + // pebble.DefaultComparer, so reuse the latter's Successor implementation. + dst = pebble.DefaultComparer.Successor(dst, aKey) + // Did it pick a successor different than aKey -- if it did not we can't do better than a. 
+ buf := dst[n:] + if bytes.Equal(aKey, buf) { + return append(dst[:n], a...) + } + // The successor is > aKey, so we only need to add the timestamp sentinel. + return append(dst, 0) + }, + + Split: func(k []byte) int { + key, _, ok := mvccSplitKey(k) + if !ok { + return len(k) + } + // This matches the behavior of libroach/KeyPrefix. RocksDB requires that + // keys generated via a SliceTransform be comparable with normal encoded + // MVCC keys. Encoded MVCC keys have a suffix indicating the number of + // bytes of timestamp data. MVCC keys without a timestamp have a suffix of + // 0. We're careful in EncodeKey to make sure that the user-key always has + // a trailing 0. If there is no timestamp this falls out naturally. If + // there is a timestamp we prepend a 0 to the encoded timestamp data. + return len(key) + 1 + }, + + Name: "cockroach_comparator", +} + +func mvccSplitKey(mvccKey []byte) (key []byte, ts []byte, ok bool) { + if len(mvccKey) == 0 { + return nil, nil, false + } + n := len(mvccKey) - 1 + tsLen := int(mvccKey[n]) + if n < tsLen { + return nil, nil, false + } + key = mvccKey[:n-tsLen] + if tsLen > 0 { + ts = mvccKey[n-tsLen+1 : len(mvccKey)-1] + } + return key, ts, true +} + +func mvccCompare(a, b []byte) int { + // NB: For performance, this routine manually splits the key into the + // user-key and timestamp components rather than using SplitMVCCKey. Don't + // try this at home kids: use SplitMVCCKey. + + aEnd := len(a) - 1 + bEnd := len(b) - 1 + if aEnd < 0 || bEnd < 0 { + // This should never happen unless there is some sort of corruption of + // the keys. This is a little bizarre, but the behavior exactly matches + // engine/db.cc:DBComparator. + return bytes.Compare(a, b) + } + + // Compute the index of the separator between the key and the timestamp. + aSep := aEnd - int(a[aEnd]) + bSep := bEnd - int(b[bEnd]) + if aSep < 0 || bSep < 0 { + // This should never happen unless there is some sort of corruption of + // the keys. 
This is a little bizarre, but the behavior exactly matches + // engine/db.cc:DBComparator. + return bytes.Compare(a, b) + } + + // Compare the "user key" part of the key. + if c := bytes.Compare(a[:aSep], b[:bSep]); c != 0 { + return c + } + + // Compare the timestamp part of the key. + aTS := a[aSep:aEnd] + bTS := b[bSep:bEnd] + if len(aTS) == 0 { + if len(bTS) == 0 { + return 0 + } + return -1 + } else if len(bTS) == 0 { + return 1 + } + return bytes.Compare(bTS, aTS) +} + +// \x00[[]]<#timestamp-bytes> +func mvccEncode(dst, key []byte, walltime uint64, logical uint32) []byte { + dst = append(dst, key...) + dst = append(dst, 0) + if walltime != 0 || logical != 0 { + extra := byte(1 + 8) + dst = encodeUint64Ascending(dst, walltime) + if logical != 0 { + dst = encodeUint32Ascending(dst, logical) + extra += 4 + } + dst = append(dst, extra) + } + return dst +} + +func mvccForwardScan(d DB, start, end, ts []byte) (int, int64) { + it := d.NewIter(&pebble.IterOptions{ + LowerBound: mvccEncode(nil, start, 0, 0), + UpperBound: mvccEncode(nil, end, 0, 0), + }) + defer it.Close() + + var data bytealloc.A + var count int + var nbytes int64 + + for valid := it.First(); valid; valid = it.Next() { + key, keyTS, _ := mvccSplitKey(it.Key()) + if bytes.Compare(keyTS, ts) <= 0 { + data, _ = data.Copy(key) + data, _ = data.Copy(it.Value()) + } + count++ + nbytes += int64(len(it.Key()) + len(it.Value())) + } + return count, nbytes +} + +func mvccReverseScan(d DB, start, end, ts []byte) (int, int64) { + it := d.NewIter(&pebble.IterOptions{ + LowerBound: mvccEncode(nil, start, 0, 0), + UpperBound: mvccEncode(nil, end, 0, 0), + }) + defer it.Close() + + var data bytealloc.A + var count int + var nbytes int64 + + for valid := it.Last(); valid; valid = it.Prev() { + key, keyTS, _ := mvccSplitKey(it.Key()) + if bytes.Compare(keyTS, ts) <= 0 { + data, _ = data.Copy(key) + data, _ = data.Copy(it.Value()) + } + count++ + nbytes += int64(len(it.Key()) + len(it.Value())) + } + return count, 
nbytes +} + +var fauxMVCCMerger = &pebble.Merger{ + Name: "cockroach_merge_operator", + Merge: func(key, value []byte) (pebble.ValueMerger, error) { + // This merger is used by the compact benchmark and use the + // pebble default value merger to concatenate values. + // It shouldn't materially affect the benchmarks. + return pebble.DefaultMerger.Merge(key, value) + }, +} diff --git a/pebble/cmd/pebble/queue.go b/pebble/cmd/pebble/queue.go new file mode 100644 index 0000000..7193741 --- /dev/null +++ b/pebble/cmd/pebble/queue.go @@ -0,0 +1,116 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "fmt" + "log" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/internal/randvar" + "github.com/spf13/cobra" + "golang.org/x/exp/rand" +) + +var queueConfig struct { + size int + values *randvar.BytesFlag +} + +func initQueue(cmd *cobra.Command) { + cmd.Flags().IntVar( + &queueConfig.size, "queue-size", 256, + "size of the queue to maintain") + queueConfig.values = randvar.NewBytesFlag("16384") + cmd.Flags().Var( + queueConfig.values, "queue-values", + "queue value size distribution [{zipf,uniform}:]min[-max][/]") +} + +func queueTest() (test, *atomic.Int64) { + ops := new(atomic.Int64) // atomic + var ( + lastOps int64 + lastElapsed time.Duration + ) + return test{ + init: func(d DB, wg *sync.WaitGroup) { + var ( + value []byte + rng = rand.New(rand.NewSource(1449168817)) + queue = make([][]byte, queueConfig.size) + ) + for i := 0; i < queueConfig.size; i++ { + b := d.NewBatch() + queue[i] = mvccEncode(nil, encodeUint32Ascending([]byte("queue-"), uint32(i)), uint64(i+1), 0) + value = queueConfig.values.Bytes(rng, value) + b.Set(queue[i], value, pebble.NoSync) + if err := b.Commit(pebble.NoSync); err != nil { + log.Fatal(err) + } + } + if err := d.Flush(); err != 
nil { + log.Fatal(err) + } + + limiter := maxOpsPerSec.newRateLimiter() + wg.Add(1) + go func() { + defer wg.Done() + + for i := queueConfig.size; ; i++ { + idx := i % queueConfig.size + + // Delete the head. + b := d.NewBatch() + if err := b.Delete(queue[idx], pebble.Sync); err != nil { + log.Fatal(err) + } + if err := b.Commit(pebble.Sync); err != nil { + log.Fatal(err) + } + _ = b.Close() + wait(limiter) + + // Append to the tail. + b = d.NewBatch() + queue[idx] = mvccEncode(queue[idx][:0], encodeUint32Ascending([]byte("queue-"), uint32(i)), uint64(i+1), 0) + value = queueConfig.values.Bytes(rng, value) + b.Set(queue[idx], value, nil) + if err := b.Commit(pebble.Sync); err != nil { + log.Fatal(err) + } + _ = b.Close() + wait(limiter) + ops.Add(1) + } + }() + }, + tick: func(elapsed time.Duration, i int) { + if i%20 == 0 { + fmt.Println("Queue___elapsed_______ops/sec") + } + + curOps := ops.Load() + dur := elapsed - lastElapsed + fmt.Printf("%15s %13.1f\n", + time.Duration(elapsed.Seconds()+0.5)*time.Second, + float64(curOps-lastOps)/dur.Seconds(), + ) + lastOps = curOps + lastElapsed = elapsed + }, + done: func(elapsed time.Duration) { + curOps := ops.Load() + fmt.Println("\nQueue___elapsed___ops/sec(cum)") + fmt.Printf("%13.1fs %14.1f\n\n", + elapsed.Seconds(), + float64(curOps)/elapsed.Seconds()) + }, + }, ops +} diff --git a/pebble/cmd/pebble/random.go b/pebble/cmd/pebble/random.go new file mode 100644 index 0000000..c098b74 --- /dev/null +++ b/pebble/cmd/pebble/random.go @@ -0,0 +1,92 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package main + +import ( + "strconv" + "strings" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/randvar" + "github.com/cockroachdb/pebble/internal/rate" +) + +type rateFlag struct { + randvar.Flag + fluctuateDuration time.Duration + spec string +} + +func newRateFlag(spec string) *rateFlag { + f := &rateFlag{} + if err := f.Set(spec); err != nil { + panic(err) + } + return f +} + +func (f *rateFlag) String() string { + return f.spec +} + +// Type implements the Flag.Value interface. +func (f *rateFlag) Type() string { + return "ratevar" +} + +// Set implements the Flag.Value interface. +func (f *rateFlag) Set(spec string) error { + if spec == "" { + if err := f.Flag.Set("0"); err != nil { + return err + } + f.fluctuateDuration = time.Duration(0) + f.spec = spec + return nil + } + + parts := strings.Split(spec, "/") + if len(parts) == 0 || len(parts) > 2 { + return errors.Errorf("invalid ratevar spec: %s", errors.Safe(spec)) + } + if err := f.Flag.Set(parts[0]); err != nil { + return err + } + // Don't fluctuate by default. 
+ f.fluctuateDuration = time.Duration(0) + if len(parts) == 2 { + fluctuateDurationFloat, err := strconv.ParseFloat(parts[1], 64) + if err != nil { + return err + } + f.fluctuateDuration = time.Duration(fluctuateDurationFloat) * time.Second + } + f.spec = spec + return nil +} + +func (f *rateFlag) newRateLimiter() *rate.Limiter { + if f.spec == "" { + return nil + } + rng := randvar.NewRand() + limiter := rate.NewLimiter(float64(f.Uint64(rng)), 1) + if f.fluctuateDuration != 0 { + go func(limiter *rate.Limiter) { + ticker := time.NewTicker(f.fluctuateDuration) + for range ticker.C { + limiter.SetRate(float64(f.Uint64(rng))) + } + }(limiter) + } + return limiter +} + +func wait(l *rate.Limiter) { + if l != nil { + l.Wait(1) + } +} diff --git a/pebble/cmd/pebble/replay.go b/pebble/cmd/pebble/replay.go new file mode 100644 index 0000000..7479769 --- /dev/null +++ b/pebble/cmd/pebble/replay.go @@ -0,0 +1,448 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package main + +import ( + "bytes" + "context" + "flag" + "fmt" + "io" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "syscall" + "time" + "unicode" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/bloom" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/cache" + "github.com/cockroachdb/pebble/replay" + "github.com/cockroachdb/pebble/vfs" + "github.com/spf13/cobra" +) + +func initReplayCmd() *cobra.Command { + c := replayConfig{ + pacer: pacerFlag{Pacer: replay.PaceByFixedReadAmp(10)}, + runDir: "", + count: 1, + streamLogs: false, + ignoreCheckpoint: false, + } + cmd := &cobra.Command{ + Use: "replay ", + Short: "run the provided captured write workload", + Args: cobra.ExactArgs(1), + RunE: c.runE, + } + cmd.Flags().IntVar( + &c.count, "count", 1, "the number of times to replay the workload") + cmd.Flags().StringVar( + &c.name, "name", "", "the name of the workload being replayed") + cmd.Flags().VarPF( + &c.pacer, "pacer", "p", "the pacer to use: unpaced, reference-ramp, or fixed-ramp=N") + cmd.Flags().Uint64Var( + &c.maxWritesMB, "max-writes", 0, "the maximum volume of writes (MB) to apply, with 0 denoting unlimited") + cmd.Flags().StringVar( + &c.optionsString, "options", "", "Pebble options to override, in the OPTIONS ini format but with any whitespace as field delimiters instead of newlines") + cmd.Flags().StringVar( + &c.runDir, "run-dir", c.runDir, "the directory to use for the replay data directory; defaults to a random dir in pwd") + cmd.Flags().Int64Var( + &c.maxCacheSize, "max-cache-size", c.maxCacheSize, "the max size of the block cache") + cmd.Flags().BoolVar( + &c.streamLogs, "stream-logs", c.streamLogs, "stream the Pebble logs to stdout during replay") + cmd.Flags().BoolVar( + &c.ignoreCheckpoint, "ignore-checkpoint", c.ignoreCheckpoint, "ignore the workload's initial checkpoint") + cmd.Flags().StringVar( + &c.checkpointDir, "checkpoint-dir", 
c.checkpointDir, "path to the checkpoint to use if not /checkpoint") + return cmd +} + +type replayConfig struct { + name string + pacer pacerFlag + runDir string + count int + maxWritesMB uint64 + streamLogs bool + checkpointDir string + ignoreCheckpoint bool + optionsString string + maxCacheSize int64 + + cleanUpFuncs []func() error +} + +func (c *replayConfig) args() (args []string) { + if c.name != "" { + args = append(args, "--name", c.name) + } + if c.pacer.spec != "" { + args = append(args, "--pacer", c.pacer.spec) + } + if c.runDir != "" { + args = append(args, "--run-dir", c.runDir) + } + if c.count != 0 { + args = append(args, "--count", fmt.Sprint(c.count)) + } + if c.maxWritesMB != 0 { + args = append(args, "--max-writes", fmt.Sprint(c.maxWritesMB)) + } + if c.maxCacheSize != 0 { + args = append(args, "--max-cache-size", fmt.Sprint(c.maxCacheSize)) + } + if c.streamLogs { + args = append(args, "--stream-logs") + } + if c.checkpointDir != "" { + args = append(args, "--checkpoint-dir", c.checkpointDir) + } + if c.ignoreCheckpoint { + args = append(args, "--ignore-checkpoint") + } + if c.optionsString != "" { + args = append(args, "--options", c.optionsString) + } + return args +} + +func (c *replayConfig) runE(cmd *cobra.Command, args []string) error { + if c.ignoreCheckpoint && c.checkpointDir != "" { + return errors.Newf("cannot provide both --checkpoint-dir and --ignore-checkpoint") + } + stdout := cmd.OutOrStdout() + + workloadPath := args[0] + if err := c.runOnce(stdout, workloadPath); err != nil { + return err + } + c.count-- + + // If necessary, run it again. We run again replacing our existing process + // with the next run so that we're truly starting over. This helps avoid the + // possibility of state within the Go runtime, the fragmentation of the + // heap, or global state within Pebble from interfering with the + // independence of individual runs. 
Previously we called runOnce multiple + // times without exec-ing, but we observed less variance between runs from + // within the same process. + if c.count > 0 { + fmt.Printf("%d runs remaining.", c.count) + executable, err := os.Executable() + if err != nil { + return err + } + execArgs := append(append([]string{executable, "bench", "replay"}, c.args()...), workloadPath) + syscall.Exec(executable, execArgs, os.Environ()) + } + return nil +} + +func (c *replayConfig) runOnce(stdout io.Writer, workloadPath string) error { + defer c.cleanUp() + if c.name == "" { + c.name = vfs.Default.PathBase(workloadPath) + } + + r := &replay.Runner{ + RunDir: c.runDir, + WorkloadFS: vfs.Default, + WorkloadPath: workloadPath, + Pacer: c.pacer, + Opts: &pebble.Options{}, + } + if c.maxWritesMB > 0 { + r.MaxWriteBytes = c.maxWritesMB * (1 << 20) + } + if err := c.initRunDir(r); err != nil { + return err + } + if err := c.initOptions(r); err != nil { + return err + } + if verbose { + fmt.Fprintln(stdout, "Options:") + fmt.Fprintln(stdout, r.Opts.String()) + } + + // Begin the workload. Run does not block. + ctx := context.Background() + if err := r.Run(ctx); err != nil { + return errors.Wrapf(err, "starting workload") + } + + // Wait blocks until the workload is complete. Once Wait returns, all of the + // workload's write operations have been replayed AND the database's + // compactions have quiesced. 
+ m, err := r.Wait() + if err != nil { + return errors.Wrapf(err, "waiting for workload to complete") + } + if err := r.Close(); err != nil { + return errors.Wrapf(err, "cleaning up") + } + fmt.Fprintln(stdout, "Workload complete.") + if err := m.WriteBenchmarkString(c.name, stdout); err != nil { + return err + } + for _, plot := range m.Plots(120 /* width */, 30 /* height */) { + fmt.Fprintln(stdout, plot.Name) + fmt.Fprintln(stdout, plot.Plot) + fmt.Fprintln(stdout) + } + fmt.Fprintln(stdout, m.Final.String()) + return nil +} + +func (c *replayConfig) initRunDir(r *replay.Runner) error { + if r.RunDir == "" { + // Default to replaying in a new directory within the current working + // directory. + wd, err := os.Getwd() + if err != nil { + return err + } + r.RunDir, err = os.MkdirTemp(wd, "replay-") + if err != nil { + return err + } + c.cleanUpFuncs = append(c.cleanUpFuncs, func() error { + return os.RemoveAll(r.RunDir) + }) + } + if !c.ignoreCheckpoint { + checkpointDir := c.getCheckpointDir(r) + fmt.Printf("%s: Attempting to initialize with checkpoint %q.\n", time.Now().Format(time.RFC3339), checkpointDir) + ok, err := vfs.Clone( + r.WorkloadFS, + vfs.Default, + checkpointDir, + filepath.Join(r.RunDir), + vfs.CloneTryLink) + if err != nil { + return err + } + if !ok { + return errors.Newf("no checkpoint %q exists; you may re-run with --ignore-checkpoint", checkpointDir) + } + fmt.Printf("%s: Run directory initialized with checkpoint %q.\n", time.Now().Format(time.RFC3339), checkpointDir) + } + return nil +} + +func (c *replayConfig) initOptions(r *replay.Runner) error { + // If using a workload checkpoint, load the Options from it. + // TODO(jackson): Allow overriding the OPTIONS. 
+ if !c.ignoreCheckpoint { + ls, err := r.WorkloadFS.List(c.getCheckpointDir(r)) + if err != nil { + return err + } + sort.Strings(ls) + var optionsFilepath string + for _, l := range ls { + path := r.WorkloadFS.PathJoin(r.WorkloadPath, "checkpoint", l) + typ, _, ok := base.ParseFilename(r.WorkloadFS, path) + if ok && typ == base.FileTypeOptions { + optionsFilepath = path + } + } + f, err := r.WorkloadFS.Open(optionsFilepath) + if err != nil { + return err + } + o, err := io.ReadAll(f) + if err != nil { + return err + } + if err := f.Close(); err != nil { + return err + } + if err := r.Opts.Parse(string(o), c.parseHooks()); err != nil { + return err + } + } + if err := c.parseCustomOptions(c.optionsString, r.Opts); err != nil { + return err + } + // TODO(jackson): If r.Opts.Comparer == nil, peek at the workload's + // manifests and pull the comparer out of them. + // + // r.Opts.Comparer can only be nil at this point if ignoreCheckpoint is + // set; otherwise we'll have already extracted the Comparer from the + // checkpoint's OPTIONS file. 
+ + if c.streamLogs { + r.Opts.AddEventListener(pebble.MakeLoggingEventListener(pebble.DefaultLogger)) + } + r.Opts.EnsureDefaults() + return nil +} + +func (c *replayConfig) getCheckpointDir(r *replay.Runner) string { + if c.checkpointDir != "" { + return c.checkpointDir + } + return r.WorkloadFS.PathJoin(r.WorkloadPath, `checkpoint`) +} + +func (c *replayConfig) parseHooks() *pebble.ParseHooks { + return &pebble.ParseHooks{ + NewCache: func(size int64) *cache.Cache { + if c.maxCacheSize != 0 && size > c.maxCacheSize { + size = c.maxCacheSize + } + return cache.New(size) + }, + NewComparer: makeComparer, + NewFilterPolicy: func(name string) (pebble.FilterPolicy, error) { + switch name { + case "none": + return nil, nil + case "rocksdb.BuiltinBloomFilter": + return bloom.FilterPolicy(10), nil + default: + return nil, errors.Errorf("invalid filter policy name %q", name) + } + }, + NewMerger: makeMerger, + } +} + +// parseCustomOptions parses Pebble Options passed through a CLI flag. +// Ordinarily Pebble Options are specified through an INI file with newlines +// delimiting fields. That doesn't translate well to a CLI interface, so this +// function accepts fields are that delimited by any whitespace. This is the +// same format that CockroachDB accepts Pebble Options through the --store flag, +// and this code is copied from there. +func (c *replayConfig) parseCustomOptions(optsStr string, opts *pebble.Options) error { + if optsStr == "" { + return nil + } + // Pebble options are supplied in the Pebble OPTIONS ini-like + // format, but allowing any whitespace to delimit lines. Convert + // the options to a newline-delimited format. This isn't a trivial + // character replacement because whitespace may appear within a + // stanza, eg ["Level 0"]. 
+ value := strings.TrimSpace(optsStr) + var buf bytes.Buffer + for len(value) > 0 { + i := strings.IndexFunc(value, func(r rune) bool { + return r == '[' || unicode.IsSpace(r) + }) + switch { + case i == -1: + buf.WriteString(value) + value = value[len(value):] + case value[i] == '[': + // If there's whitespace within [ ], we write it verbatim. + j := i + strings.IndexRune(value[i:], ']') + buf.WriteString(value[:j+1]) + value = value[j+1:] + case unicode.IsSpace(rune(value[i])): + // NB: This doesn't handle multibyte whitespace. + buf.WriteString(value[:i]) + buf.WriteRune('\n') + value = strings.TrimSpace(value[i+1:]) + } + } + return opts.Parse(buf.String(), c.parseHooks()) +} + +func (c *replayConfig) cleanUp() error { + for _, f := range c.cleanUpFuncs { + if err := f(); err != nil { + return err + } + } + return nil +} + +func makeComparer(name string) (*pebble.Comparer, error) { + switch name { + case base.DefaultComparer.Name: + return base.DefaultComparer, nil + case "cockroach_comparator": + return mvccComparer, nil + default: + return nil, errors.Newf("unrecognized comparer %q", name) + } +} + +func makeMerger(name string) (*pebble.Merger, error) { + switch name { + case base.DefaultMerger.Name: + return base.DefaultMerger, nil + case "cockroach_merge_operator": + // We don't want to reimplement the cockroach merger. Instead we + // implement this merger to return the newer of the two operands. This + // doesn't exactly model cockroach's true use but should be good enough. + // TODO(jackson): Consider lifting replay into a `cockroach debug` + // command so we can use the true merger and comparer. 
+ merger := new(pebble.Merger) + merger.Merge = func(key, value []byte) (pebble.ValueMerger, error) { + return &overwriteValueMerger{value: append([]byte{}, value...)}, nil + } + merger.Name = name + return merger, nil + default: + return nil, errors.Newf("unrecognized comparer %q", name) + } +} + +// pacerFlag provides a command line flag interface for specifying the pacer to +// use. It implements the flag.Value interface. +type pacerFlag struct { + replay.Pacer + spec string +} + +var _ flag.Value = (*pacerFlag)(nil) + +func (f *pacerFlag) String() string { return f.spec } +func (f *pacerFlag) Type() string { return "pacer" } + +// Set implements the Flag.Value interface. +func (f *pacerFlag) Set(spec string) error { + f.spec = spec + switch { + case spec == "unpaced": + f.Pacer = replay.Unpaced{} + case spec == "reference-ramp": + f.Pacer = replay.PaceByReferenceReadAmp{} + case strings.HasPrefix(spec, "fixed-ramp="): + rAmp, err := strconv.Atoi(strings.TrimPrefix(spec, "fixed-ramp=")) + if err != nil { + return errors.Newf("unable to parse fixed r-amp: %s", err) + } + f.Pacer = replay.PaceByFixedReadAmp(rAmp) + default: + return errors.Newf("unrecognized pacer spec: %q", errors.Safe(spec)) + } + return nil +} + +type overwriteValueMerger struct { + value []byte +} + +func (o *overwriteValueMerger) MergeNewer(value []byte) error { + o.value = append(o.value[:0], value...) + return nil +} + +func (o *overwriteValueMerger) MergeOlder(value []byte) error { + return nil +} + +func (o *overwriteValueMerger) Finish(includesBase bool) ([]byte, io.Closer, error) { + return o.value, nil, nil +} diff --git a/pebble/cmd/pebble/replay_test.go b/pebble/cmd/pebble/replay_test.go new file mode 100644 index 0000000..b3f6225 --- /dev/null +++ b/pebble/cmd/pebble/replay_test.go @@ -0,0 +1,77 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package main + +import ( + "fmt" + "testing" + + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/internal/cache" + "github.com/stretchr/testify/require" +) + +func TestParseOptionsStr(t *testing.T) { + type testCase struct { + c replayConfig + options *pebble.Options + } + + testCases := []testCase{ + { + c: replayConfig{optionsString: `[Options] max_concurrent_compactions=9`}, + options: &pebble.Options{MaxConcurrentCompactions: func() int { return 9 }}, + }, + { + c: replayConfig{optionsString: `[Options] bytes_per_sync=90000`}, + options: &pebble.Options{BytesPerSync: 90000}, + }, + { + c: replayConfig{optionsString: fmt.Sprintf(`[Options] cache_size=%d`, 16<<20 /* 16MB */)}, + options: &pebble.Options{Cache: cache.New(16 << 20 /* 16 MB */)}, + }, + { + c: replayConfig{ + maxCacheSize: 16 << 20, /* 16 MB */ + optionsString: fmt.Sprintf(`[Options] cache_size=%d`, int64(10<<30 /* 10 GB */)), + }, + options: &pebble.Options{Cache: cache.New(16 << 20 /* 16 MB */)}, + }, + { + c: replayConfig{optionsString: `[Options] [Level "0"] target_file_size=222`}, + options: &pebble.Options{Levels: []pebble.LevelOptions{ + {TargetFileSize: 222}, + }}, + }, + { + c: replayConfig{optionsString: `[Options] lbase_max_bytes=10 max_open_files=20 [Level "0"] target_file_size=30 [Level "1"] index_block_size=40`}, + options: &pebble.Options{ + LBaseMaxBytes: 10, + MaxOpenFiles: 20, + Levels: []pebble.LevelOptions{ + {TargetFileSize: 30}, + {IndexBlockSize: 40}, + }, + }, + }, + } + + for _, tc := range testCases { + o := new(pebble.Options) + require.NoError(t, tc.c.parseCustomOptions(tc.c.optionsString, o)) + o.EnsureDefaults() + got := o.String() + + tc.options.EnsureDefaults() + want := tc.options.String() + require.Equal(t, want, got) + if o.Cache != nil { + o.Cache.Unref() + } + if tc.options.Cache != nil { + tc.options.Cache.Unref() + } + } +} diff --git a/pebble/cmd/pebble/scan.go b/pebble/cmd/pebble/scan.go new file mode 100644 index 0000000..0803501 --- 
/dev/null +++ b/pebble/cmd/pebble/scan.go @@ -0,0 +1,160 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "fmt" + "log" + "math" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/internal/randvar" + "github.com/spf13/cobra" + "golang.org/x/exp/rand" +) + +var scanConfig struct { + reverse bool + rows *randvar.Flag + values *randvar.BytesFlag +} + +var scanCmd = &cobra.Command{ + Use: "scan ", + Short: "run the scan benchmark", + Long: ``, + Args: cobra.ExactArgs(1), + Run: runScan, +} + +func init() { + scanCmd.Flags().BoolVarP( + &scanConfig.reverse, "reverse", "r", false, "reverse scan") + scanConfig.rows = randvar.NewFlag("100") + scanCmd.Flags().Var( + scanConfig.rows, "rows", "number of rows to scan in each operation") + scanConfig.values = randvar.NewBytesFlag("8") + scanCmd.Flags().Var( + scanConfig.values, "values", + "value size distribution [{zipf,uniform}:]min[-max][/]") +} + +func runScan(cmd *cobra.Command, args []string) { + var ( + bytes atomic.Int64 + scanned atomic.Int64 + lastBytes int64 + lastScanned int64 + lastElapsed time.Duration + ) + + opts := pebble.Sync + if disableWAL { + opts = pebble.NoSync + } + + rowDist := scanConfig.rows + + runTest(args[0], test{ + init: func(d DB, wg *sync.WaitGroup) { + const count = 100000 + const batch = 1000 + + rng := rand.New(rand.NewSource(1449168817)) + keys := make([][]byte, count) + + for i := 0; i < count; { + b := d.NewBatch() + var value []byte + for end := i + batch; i < end; i++ { + keys[i] = mvccEncode(nil, encodeUint32Ascending([]byte("key-"), uint32(i)), uint64(i+1), 0) + value = scanConfig.values.Bytes(rng, value) + if err := b.Set(keys[i], value, nil); err != nil { + log.Fatal(err) + } + } + if err := b.Commit(opts); err != nil { + log.Fatal(err) + } + } + + if err := d.Flush(); 
err != nil { + log.Fatal(err) + } + + limiter := maxOpsPerSec.newRateLimiter() + + wg.Add(concurrency) + for i := 0; i < concurrency; i++ { + go func(i int) { + defer wg.Done() + + rng := rand.New(rand.NewSource(uint64(i))) + startKeyBuf := append(make([]byte, 0, 64), []byte("key-")...) + endKeyBuf := append(make([]byte, 0, 64), []byte("key-")...) + minTS := encodeUint64Ascending(nil, math.MaxUint64) + + for { + wait(limiter) + + rows := int(rowDist.Uint64(rng)) + startIdx := rng.Int31n(int32(len(keys) - rows)) + startKey := encodeUint32Ascending(startKeyBuf[:4], uint32(startIdx)) + endKey := encodeUint32Ascending(endKeyBuf[:4], uint32(startIdx+int32(rows))) + + var count int + var nbytes int64 + if scanConfig.reverse { + count, nbytes = mvccReverseScan(d, startKey, endKey, minTS) + } else { + count, nbytes = mvccForwardScan(d, startKey, endKey, minTS) + } + + if count != rows { + log.Fatalf("scanned %d, expected %d\n", count, rows) + } + + bytes.Add(nbytes) + scanned.Add(int64(count)) + } + }(i) + } + }, + + tick: func(elapsed time.Duration, i int) { + if i%20 == 0 { + fmt.Println("_elapsed_______rows/sec_______MB/sec_______ns/row") + } + + curBytes := bytes.Load() + curScanned := scanned.Load() + dur := elapsed - lastElapsed + fmt.Printf("%8s %14.1f %12.1f %12.1f\n", + time.Duration(elapsed.Seconds()+0.5)*time.Second, + float64(curScanned-lastScanned)/dur.Seconds(), + float64(curBytes-lastBytes)/(dur.Seconds()*(1<<20)), + float64(dur)/float64(curScanned-lastScanned), + ) + lastBytes = curBytes + lastScanned = curScanned + lastElapsed = elapsed + }, + + done: func(elapsed time.Duration) { + curBytes := bytes.Load() + curScanned := scanned.Load() + fmt.Println("\n_elapsed___ops/sec(cum)__MB/sec(cum)__ns/row(avg)") + fmt.Printf("%7.1fs %14.1f %12.1f %12.1f\n\n", + elapsed.Seconds(), + float64(curScanned)/elapsed.Seconds(), + float64(curBytes)/(elapsed.Seconds()*(1<<20)), + float64(elapsed)/float64(curScanned), + ) + }, + }) +} diff --git a/pebble/cmd/pebble/sync.go 
b/pebble/cmd/pebble/sync.go new file mode 100644 index 0000000..e2add26 --- /dev/null +++ b/pebble/cmd/pebble/sync.go @@ -0,0 +1,143 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "fmt" + "log" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/internal/randvar" + "github.com/spf13/cobra" + "golang.org/x/exp/rand" +) + +var syncConfig struct { + batch *randvar.Flag + walOnly bool + values *randvar.BytesFlag +} + +var syncCmd = &cobra.Command{ + Use: "sync ", + Short: "run the sync benchmark", + Long: ``, + Args: cobra.ExactArgs(1), + Run: runSync, +} + +func init() { + syncConfig.batch = randvar.NewFlag("5") + syncCmd.Flags().Var( + syncConfig.batch, "batch", + "batch size distribution [{zipf,uniform}:]min[-max]") + syncCmd.Flags().BoolVar( + &syncConfig.walOnly, "wal-only", false, "write data only to the WAL") + syncConfig.values = randvar.NewBytesFlag("uniform:60-80/1.0") + syncCmd.Flags().Var( + syncConfig.values, "values", + "value size distribution [{zipf,uniform}:]min[-max][/]") +} + +func runSync(cmd *cobra.Command, args []string) { + reg := newHistogramRegistry() + var bytes atomic.Uint64 + var lastBytes uint64 + + opts := pebble.Sync + if disableWAL { + opts = pebble.NoSync + } + + batchDist := syncConfig.batch + + runTest(args[0], test{ + init: func(d DB, wg *sync.WaitGroup) { + limiter := maxOpsPerSec.newRateLimiter() + + wg.Add(concurrency) + for i := 0; i < concurrency; i++ { + latency := reg.Register("ops") + go func() { + defer wg.Done() + + rand := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + var raw []byte + var buf []byte + var block []byte + for { + wait(limiter) + + start := time.Now() + b := d.NewBatch() + var n uint64 + count := int(batchDist.Uint64(rand)) + for j := 0; j < count; j++ { + block = 
syncConfig.values.Bytes(rand, block) + + if syncConfig.walOnly { + if err := b.LogData(block, nil); err != nil { + log.Fatal(err) + } + } else { + raw = encodeUint32Ascending(raw[:0], rand.Uint32()) + key := mvccEncode(buf[:0], raw, 0, 0) + buf = key[:0] + if err := b.Set(key, block, nil); err != nil { + log.Fatal(err) + } + } + n += uint64(len(block)) + } + if err := b.Commit(opts); err != nil { + log.Fatal(err) + } + latency.Record(time.Since(start)) + bytes.Add(n) + } + }() + } + }, + + tick: func(elapsed time.Duration, i int) { + if i%20 == 0 { + fmt.Println("_elapsed____ops/sec___mb/sec__p50(ms)__p95(ms)__p99(ms)_pMax(ms)") + } + reg.Tick(func(tick histogramTick) { + h := tick.Hist + n := bytes.Load() + fmt.Printf("%8s %10.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", + time.Duration(elapsed.Seconds()+0.5)*time.Second, + float64(h.TotalCount())/tick.Elapsed.Seconds(), + float64(n-lastBytes)/(1024.0*1024.0)/tick.Elapsed.Seconds(), + time.Duration(h.ValueAtQuantile(50)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(95)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(99)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(100)).Seconds()*1000, + ) + lastBytes = n + }) + }, + + done: func(elapsed time.Duration) { + fmt.Println("\n_elapsed___ops(total)_ops/sec(cum)_mb/sec(cum)__avg(ms)__p50(ms)__p95(ms)__p99(ms)_pMax(ms)") + reg.Tick(func(tick histogramTick) { + h := tick.Cumulative + fmt.Printf("%7.1fs %12d %12.1f %11.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n\n", + elapsed.Seconds(), h.TotalCount(), + float64(h.TotalCount())/elapsed.Seconds(), + float64(bytes.Load()/(1024.0*1024.0))/elapsed.Seconds(), + time.Duration(h.Mean()).Seconds()*1000, + time.Duration(h.ValueAtQuantile(50)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(95)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(99)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(100)).Seconds()*1000) + }) + }, + }) +} diff --git a/pebble/cmd/pebble/test.go b/pebble/cmd/pebble/test.go new file mode 100644 index 
0000000..c8d707b --- /dev/null +++ b/pebble/cmd/pebble/test.go @@ -0,0 +1,400 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "fmt" + "io" + "log" + "os" + "os/signal" + "runtime" + "runtime/pprof" + "sort" + "sync" + "syscall" + "time" + + "github.com/HdrHistogram/hdrhistogram-go" + "github.com/cockroachdb/pebble" +) + +const ( + minLatency = 10 * time.Microsecond + maxLatency = 10 * time.Second +) + +func startCPUProfile() func() { + runtime.SetMutexProfileFraction(1000) + + done := startRecording("cpu.%04d.prof", pprof.StartCPUProfile, pprof.StopCPUProfile) + return func() { + done() + if p := pprof.Lookup("heap"); p != nil { + f, err := os.Create("heap.prof") + if err != nil { + log.Fatal(err) + } + if err := p.WriteTo(f, 0); err != nil { + log.Fatal(err) + } + f.Close() + } + if p := pprof.Lookup("mutex"); p != nil { + f, err := os.Create("mutex.prof") + if err != nil { + log.Fatal(err) + } + if err := p.WriteTo(f, 0); err != nil { + log.Fatal(err) + } + f.Close() + } + } +} + +func startRecording(fmtStr string, startFunc func(io.Writer) error, stopFunc func()) func() { + doneCh := make(chan struct{}) + var doneWG sync.WaitGroup + doneWG.Add(1) + + go func() { + defer doneWG.Done() + + start := time.Now() + t := time.NewTicker(10 * time.Second) + defer t.Stop() + + var current *os.File + defer func() { + if current != nil { + stopFunc() + current.Close() + } + }() + + for { + if current != nil { + stopFunc() + current.Close() + current = nil + } + path := fmt.Sprintf(fmtStr, int(time.Since(start).Seconds()+0.5)) + f, err := os.Create(path) + if err != nil { + log.Fatalf("unable to create cpu profile: %s", err) + return + } + if err := startFunc(f); err != nil { + log.Fatalf("unable to start cpu profile: %v", err) + f.Close() + return + } + current = f + + select { + case <-doneCh: + return + case 
<-t.C: + } + } + }() + + return func() { + close(doneCh) + doneWG.Wait() + } +} + +func newHistogram() *hdrhistogram.Histogram { + return hdrhistogram.New(minLatency.Nanoseconds(), maxLatency.Nanoseconds(), 1) +} + +type namedHistogram struct { + name string + mu struct { + sync.Mutex + current *hdrhistogram.Histogram + } +} + +func newNamedHistogram(name string) *namedHistogram { + w := &namedHistogram{name: name} + w.mu.current = newHistogram() + return w +} + +func (w *namedHistogram) Record(elapsed time.Duration) { + if elapsed < minLatency { + elapsed = minLatency + } else if elapsed > maxLatency { + elapsed = maxLatency + } + + w.mu.Lock() + err := w.mu.current.RecordValue(elapsed.Nanoseconds()) + w.mu.Unlock() + + if err != nil { + // Note that a histogram only drops recorded values that are out of range, + // but we clamp the latency value to the configured range to prevent such + // drops. This code path should never happen. + panic(fmt.Sprintf(`%s: recording value: %s`, w.name, err)) + } +} + +func (w *namedHistogram) tick(fn func(h *hdrhistogram.Histogram)) { + w.mu.Lock() + defer w.mu.Unlock() + h := w.mu.current + w.mu.current = newHistogram() + fn(h) +} + +type histogramTick struct { + // Name is the name given to the histograms represented by this tick. + Name string + // Hist is the merged result of the represented histograms for this tick. + // Hist.TotalCount() is the number of operations that occurred for this tick. + Hist *hdrhistogram.Histogram + // Cumulative is the merged result of the represented histograms for all + // time. Cumulative.TotalCount() is the total number of operations that have + // occurred over all time. + Cumulative *hdrhistogram.Histogram + // Elapsed is the amount of time since the last tick. + Elapsed time.Duration + // Now is the time at which the tick was gathered. It covers the period + // [Now-Elapsed,Now). 
+ Now time.Time +} + +type histogramRegistry struct { + mu struct { + sync.Mutex + registered []*namedHistogram + } + + start time.Time + cumulative map[string]*hdrhistogram.Histogram + prevTick map[string]time.Time +} + +func newHistogramRegistry() *histogramRegistry { + return &histogramRegistry{ + start: time.Now(), + cumulative: make(map[string]*hdrhistogram.Histogram), + prevTick: make(map[string]time.Time), + } +} + +func (w *histogramRegistry) Register(name string) *namedHistogram { + hist := newNamedHistogram(name) + + w.mu.Lock() + w.mu.registered = append(w.mu.registered, hist) + w.mu.Unlock() + + return hist +} + +func (w *histogramRegistry) Tick(fn func(histogramTick)) { + w.mu.Lock() + registered := append([]*namedHistogram(nil), w.mu.registered...) + w.mu.Unlock() + + merged := make(map[string]*hdrhistogram.Histogram) + var names []string + for _, hist := range registered { + hist.tick(func(h *hdrhistogram.Histogram) { + if p, ok := merged[hist.name]; ok { + p.Merge(h) + } else { + merged[hist.name] = h + names = append(names, hist.name) + } + }) + } + + now := time.Now() + sort.Strings(names) + for _, name := range names { + mergedHist := merged[name] + if _, ok := w.cumulative[name]; !ok { + w.cumulative[name] = newHistogram() + } + w.cumulative[name].Merge(mergedHist) + + prevTick, ok := w.prevTick[name] + if !ok { + prevTick = w.start + } + w.prevTick[name] = now + fn(histogramTick{ + Name: name, + Hist: merged[name], + Cumulative: w.cumulative[name], + Elapsed: now.Sub(prevTick), + Now: now, + }) + } +} + +type testWithoutDB struct { + init func(wg *sync.WaitGroup) + tick func(elapsed time.Duration, i int) + done func(wg *sync.WaitGroup, elapsed time.Duration) +} + +func runTestWithoutDB(t testWithoutDB) { + var wg sync.WaitGroup + t.init(&wg) + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + done := make(chan os.Signal, 3) + workersDone := make(chan struct{}) + signal.Notify(done, os.Interrupt) + + go func() { + wg.Wait() + 
close(workersDone) + }() + + if duration > 0 { + go func() { + time.Sleep(duration) + done <- syscall.Signal(0) + }() + } + + stopProf := startCPUProfile() + defer stopProf() + + start := time.Now() + for i := 0; ; i++ { + select { + case <-ticker.C: + if workersDone != nil { + t.tick(time.Since(start), i) + } + + case <-workersDone: + workersDone = nil + t.done(&wg, time.Since(start)) + return + + case sig := <-done: + fmt.Println("operating system is killing the op.", sig) + if workersDone != nil { + t.done(&wg, time.Since(start)) + } + return + } + } +} + +type test struct { + init func(db DB, wg *sync.WaitGroup) + tick func(elapsed time.Duration, i int) + done func(elapsed time.Duration) +} + +func runTest(dir string, t test) { + // Check if the directory exists. + if wipe { + fmt.Printf("wiping %s\n", dir) + if err := os.RemoveAll(dir); err != nil { + log.Fatal(err) + } + } + + fmt.Printf("dir %s\nconcurrency %d\n", dir, concurrency) + + db := newPebbleDB(dir) + var wg sync.WaitGroup + t.init(db, &wg) + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + done := make(chan os.Signal, 3) + workersDone := make(chan struct{}) + signal.Notify(done, os.Interrupt) + + go func() { + wg.Wait() + close(workersDone) + }() + + if maxSize > 0 { + go func() { + for { + time.Sleep(10 * time.Second) + if db.Metrics().DiskSpaceUsage() > maxSize*1e6 { + fmt.Println("max size reached") + done <- syscall.Signal(0) + } + } + }() + } + if duration > 0 { + go func() { + time.Sleep(duration) + done <- syscall.Signal(0) + }() + } + + stopProf := startCPUProfile() + defer stopProf() + + backgroundCompactions := func(p *pebble.Metrics) bool { + // The last level never gets selected as an input level for compaction, + // only as an output level, so ignore it for the purposes of determining if + // background compactions are still needed. 
+ for i := range p.Levels[:len(p.Levels)-1] { + if p.Levels[i].Score > 1 { + return true + } + } + return false + } + + start := time.Now() + for i := 0; ; i++ { + select { + case <-ticker.C: + if workersDone != nil { + t.tick(time.Since(start), i) + if verbose && (i%10) == 9 { + fmt.Printf("%s", db.Metrics()) + } + } else if waitCompactions { + p := db.Metrics() + fmt.Printf("%s", p) + if !backgroundCompactions(p) { + return + } + } + + case <-workersDone: + workersDone = nil + t.done(time.Since(start)) + p := db.Metrics() + fmt.Printf("%s", p) + if !waitCompactions || !backgroundCompactions(p) { + return + } + fmt.Printf("waiting for background compactions\n") + + case <-done: + if workersDone != nil { + t.done(time.Since(start)) + } + fmt.Printf("%s", db.Metrics()) + return + } + } +} diff --git a/pebble/cmd/pebble/tombstone.go b/pebble/cmd/pebble/tombstone.go new file mode 100644 index 0000000..bbe0e3b --- /dev/null +++ b/pebble/cmd/pebble/tombstone.go @@ -0,0 +1,134 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "fmt" + "log" + "sync" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/humanize" + "github.com/spf13/cobra" +) + +func init() { + // NB: the tombstone workload piggybacks off the existing flags and + // configs for the queue and ycsb workloads. + initQueue(tombstoneCmd) + initYCSB(tombstoneCmd) +} + +var tombstoneCmd = &cobra.Command{ + Use: "tombstone ", + Short: "run the mixed-workload point tombstone benchmark", + Long: ` +Run a customizable YCSB workload, alongside a single-writer, fixed-sized queue +workload. This command is intended for evaluating compaction heuristics +surrounding point tombstones. + +The queue workload writes a point tombstone with every operation. 
A compaction +strategy that does not account for point tombstones may accumulate many +uncompacted tombstones, causing steady growth of the disk space consumed by +the queue keyspace. + +The --queue-values flag controls the distribution of the queue value sizes. +Larger values are more likely to exhibit problematic point tombstone behavior +on a database using a min-overlapping ratio heuristic because the compact +point tombstones may overlap many tables in the next level. + +The --queue-size flag controls the fixed number of live keys in the queue. Low +queue sizes may not exercise problematic tombstone behavior if queue sets and +deletes get written to the same sstable. The large-valued sets can serve as a +counterweight to the point tombstones, narrowing the keyrange of the sstable +inflating its size relative to its overlap with the next level. + `, + Args: cobra.ExactArgs(1), + RunE: runTombstoneCmd, +} + +func runTombstoneCmd(cmd *cobra.Command, args []string) error { + if wipe && ycsbConfig.prepopulatedKeys > 0 { + return errors.New("--wipe and --prepopulated-keys both specified which is nonsensical") + } + + weights, err := ycsbParseWorkload(ycsbConfig.workload) + if err != nil { + return err + } + + keyDist, err := ycsbParseKeyDist(ycsbConfig.keys) + if err != nil { + return err + } + + batchDist := ycsbConfig.batch + scanDist := ycsbConfig.scans + if err != nil { + return err + } + + valueDist := ycsbConfig.values + y := newYcsb(weights, keyDist, batchDist, scanDist, valueDist) + q, queueOps := queueTest() + + queueStart := []byte("queue-") + queueEnd := append(append([]byte{}, queueStart...), 0xFF) + + var lastElapsed time.Duration + var lastQueueOps int64 + + var pdb pebbleDB + runTest(args[0], test{ + init: func(d DB, wg *sync.WaitGroup) { + pdb = d.(pebbleDB) + y.init(d, wg) + q.init(d, wg) + }, + tick: func(elapsed time.Duration, i int) { + if i%20 == 0 { + fmt.Println(" queue ycsb") + 
fmt.Println("________elapsed______queue_size__ops/sec(inst)___ops/sec(cum)__ops/sec(inst)___ops/sec(cum)") + } + + curQueueOps := queueOps.Load() + dur := elapsed - lastElapsed + queueOpsPerSec := float64(curQueueOps-lastQueueOps) / dur.Seconds() + queueCumOpsPerSec := float64(curQueueOps) / elapsed.Seconds() + + lastQueueOps = curQueueOps + lastElapsed = elapsed + + var ycsbOpsPerSec, ycsbCumOpsPerSec float64 + y.reg.Tick(func(tick histogramTick) { + h := tick.Hist + ycsbOpsPerSec = float64(h.TotalCount()) / tick.Elapsed.Seconds() + ycsbCumOpsPerSec = float64(tick.Cumulative.TotalCount()) / elapsed.Seconds() + }) + + queueSize, err := pdb.d.EstimateDiskUsage(queueStart, queueEnd) + if err != nil { + log.Fatal(err) + } + fmt.Printf("%15s %15s %14.1f %14.1f %14.1f %14.1f\n", + time.Duration(elapsed.Seconds()+0.5)*time.Second, + humanize.Bytes.Uint64(queueSize), + queueOpsPerSec, + queueCumOpsPerSec, + ycsbOpsPerSec, + ycsbCumOpsPerSec) + }, + done: func(elapsed time.Duration) { + fmt.Println("________elapsed______queue_size") + queueSize, err := pdb.d.EstimateDiskUsage(queueStart, queueEnd) + if err != nil { + log.Fatal(err) + } + fmt.Printf("%15s %15s\n", elapsed.Truncate(time.Second), humanize.Bytes.Uint64(queueSize)) + }, + }) + return nil +} diff --git a/pebble/cmd/pebble/util.go b/pebble/cmd/pebble/util.go new file mode 100644 index 0000000..2da4685 --- /dev/null +++ b/pebble/cmd/pebble/util.go @@ -0,0 +1,15 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package main + +func encodeUint32Ascending(b []byte, v uint32) []byte { + return append(b, byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) +} + +func encodeUint64Ascending(b []byte, v uint64) []byte { + return append(b, + byte(v>>56), byte(v>>48), byte(v>>40), byte(v>>32), + byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) +} diff --git a/pebble/cmd/pebble/write_bench.go b/pebble/cmd/pebble/write_bench.go new file mode 100644 index 0000000..397a536 --- /dev/null +++ b/pebble/cmd/pebble/write_bench.go @@ -0,0 +1,483 @@ +// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/cockroachdb/pebble/internal/ackseq" + "github.com/cockroachdb/pebble/internal/randvar" + "github.com/cockroachdb/pebble/internal/rate" + "github.com/spf13/cobra" +) + +// The following constants match the values that Cockroach uses in Admission +// Control at the time of writing. +// See: https://github.com/cockroachdb/cockroach/blob/cb5d5108a7705eac7be82bc7f0f8b6f4dc825b96/pkg/util/admission/granter.go#L1212-L1229 +const ( + defaultL0FileLimit = 1000 + defaultL0SubLevelLimit = 20 +) + +var writeBenchConfig struct { + batch *randvar.Flag + keys string + values *randvar.BytesFlag + concurrency int + rateStart int + incBase int + testPeriod time.Duration + cooloffPeriod time.Duration + targetL0Files int + targetL0SubLevels int + maxRateDipFraction float64 + debug bool +} + +var writeBenchCmd = &cobra.Command{ + Use: "write ", + Short: "Run YCSB F to find an a sustainable write throughput", + Long: ` +Run YCSB F (100% writes) at varying levels of sustained write load (ops/sec) to +determine an optimal value of write throughput. + +The benchmark works by maintaining a fixed amount of write load on the DB for a +fixed amount of time. 
If the database can handle the sustained load - determined +by a heuristic that takes into account the number of files in L0 sub-levels, the +number of L0 sub-levels, and whether the DB has encountered a write stall (i.e. +measured load on the DB drops to zero) - the load is increased on the DB. + +Load increases exponentially from an initial load. If the DB fails the heuristic +at the given write load, the load on the DB is paused for a period of time (the +cool-off period) before returning to the last value at which the DB could handle +the load. The exponent is then reset and the process repeats from this new +initial value. This allows the benchmark to converge on and oscillate around the +optimal write load. + +The values of load at which the DB passes and fails the heuristic are maintained +over the duration of the benchmark. On completion of the benchmark, an "optimal" +value is computed. The optimal value is computed as the value that minimizes the +mis-classification of the recorded "passes" and "fails"". This can be visualized +as a point on the x-axis that separates the passes and fails into the left and +right half-planes, minimizing the number of fails that fall to the left of this +point (i.e. mis-classified fails) and the number of passes that fall to the +right (i.e. mis-classified passes). + +The resultant "optimal sustained write load" value provides an estimate of the +write load that the DB can sustain without failing the target heuristic. + +A typical invocation of the benchmark is as follows: + + pebble bench write [PATH] --wipe -c 1024 -d 8h --rate-start 30000 --debug +`, + Args: cobra.ExactArgs(1), + RunE: runWriteBenchmark, +} + +func init() { + initWriteBench(writeBenchCmd) +} + +func initWriteBench(cmd *cobra.Command) { + // Default values for custom flags. 
+ writeBenchConfig.batch = randvar.NewFlag("1") + writeBenchConfig.values = randvar.NewBytesFlag("1000") + + cmd.Flags().Var( + writeBenchConfig.batch, "batch", + "batch size distribution [{zipf,uniform}:]min[-max]") + cmd.Flags().StringVar( + &writeBenchConfig.keys, "keys", "zipf", "latest, uniform, or zipf") + cmd.Flags().Var( + writeBenchConfig.values, "values", + "value size distribution [{zipf,uniform}:]min[-max][/]") + cmd.Flags().IntVarP( + &writeBenchConfig.concurrency, "concurrency", "c", + 1, "number of concurrent workers") + cmd.Flags().IntVar( + &writeBenchConfig.rateStart, "rate-start", + 1000, "starting write load (ops/sec)") + cmd.Flags().IntVar( + &writeBenchConfig.incBase, "rate-inc-base", + 100, "increment / decrement base") + cmd.Flags().DurationVar( + &writeBenchConfig.testPeriod, "test-period", + 60*time.Second, "time to run at a given write load") + cmd.Flags().DurationVar( + &writeBenchConfig.cooloffPeriod, "cooloff-period", + 30*time.Second, "time to pause write load after a failure") + cmd.Flags().IntVar( + &writeBenchConfig.targetL0Files, "l0-files", + defaultL0FileLimit, "target L0 file count") + cmd.Flags().IntVar( + &writeBenchConfig.targetL0SubLevels, "l0-sublevels", + defaultL0SubLevelLimit, "target L0 sublevel count") + cmd.Flags().BoolVarP( + &wipe, "wipe", "w", false, "wipe the database before starting") + cmd.Flags().Float64Var( + &writeBenchConfig.maxRateDipFraction, "max-rate-dip-fraction", 0.1, + "fraction at which to mark a test-run as failed if the actual rate dips below (relative to the desired rate)") + cmd.Flags().BoolVar( + &writeBenchConfig.debug, "debug", false, "print benchmark debug information") +} + +// writeBenchResult contains the results of a test run at a given rate. The +// independent variable is the rate (in ops/sec) and the dependent variable is +// whether the test passed or failed. Additional metadata associated with the +// test run is also captured. 
+type writeBenchResult struct {
+	name     string
+	rate     int           // The rate at which the test is currently running.
+	passed   bool          // Was the test successful at this rate.
+	elapsed  time.Duration // The total elapsed time of the test.
+	bytes    uint64        // The size of the LSM.
+	levels   int           // The number of levels occupied in the LSM.
+	writeAmp float64       // The write amplification.
+}
+
+// String implements fmt.Stringer, printing a raw benchmark line. These lines
+// are used when performing analysis on a given benchmark run.
+func (r writeBenchResult) String() string {
+	return fmt.Sprintf("BenchmarkRaw%s %d ops/sec %v pass %s elapsed %d bytes %d levels %.2f writeAmp",
+		r.name,
+		r.rate,
+		r.passed,
+		r.elapsed,
+		r.bytes,
+		r.levels,
+		r.writeAmp,
+	)
+}
+
+// runWriteBenchmark searches for the maximum sustainable write throughput: it
+// ramps the desired write rate up exponentially while test runs pass, and on
+// failure backtracks to the last passing rate after a cool-off period.
+func runWriteBenchmark(_ *cobra.Command, args []string) error {
+	const workload = "F" // 100% inserts.
+	var (
+		writers      []*pauseWriter
+		writersWg    *sync.WaitGroup // Tracks completion of all pauseWriters.
+		cooloff      bool            // Is cool-off enabled.
+		streak       int             // The number of successive passes.
+		clockStart   time.Time       // Start time for current load.
+		cooloffStart time.Time       // When cool-off was enabled.
+		stack        []int           // Stack of passing load values.
+		pass, fail   []int           // Values of load that pass and fail, respectively.
+		rateAcc      float64         // Accumulator of measured rates for a single test run.
+	)
+
+	desiredRate := writeBenchConfig.rateStart
+	incBase := writeBenchConfig.incBase
+	weights, err := ycsbParseWorkload(workload)
+
+	if err != nil {
+		return err
+	}
+
+	keyDist, err := ycsbParseKeyDist(writeBenchConfig.keys)
+	if err != nil {
+		return err
+	}
+	batchDist := writeBenchConfig.batch
+	valueDist := writeBenchConfig.values
+
+	// Construct a new YCSB F benchmark with the configured values.
+	y := newYcsb(weights, keyDist, batchDist, nil /* scans */, valueDist)
+	y.keyNum = ackseq.New(0)
+
+	// setLimit splits the given aggregate rate evenly across all writers.
+	setLimit := func(l int) {
+		perWriterRate := float64(l) / float64(len(writers))
+		for _, w := range writers {
+			w.setRate(perWriterRate)
+		}
+	}
+
+	// Function closure to run on test-run failure.
+	onTestFail := func(r writeBenchResult, cancel func()) {
+		fail = append(fail, desiredRate)
+
+		// Emit a benchmark raw datapoint.
+		fmt.Println(r)
+
+		// We failed at the current load, we have two options:
+
+		// a) No room to backtrack. We're done.
+		if len(stack) == 0 {
+			debugPrint("no room to backtrack; exiting ...\n")
+			cancel()
+			writersWg.Wait()
+			return
+		}
+
+		// b) We still have room to backtrack. Reduce the load to the
+		// last known passing value.
+		desiredRate, stack = stack[len(stack)-1], stack[:len(stack)-1]
+		setLimit(desiredRate)
+
+		// Enter the cool-off period.
+		cooloff = true
+		var wg sync.WaitGroup
+		for _, w := range writers {
+			// With a large number of writers, pausing synchronously can
+			// take a material amount of time. Instead, pause the
+			// writers in parallel in the background, and wait for all
+			// to complete before continuing.
+			wg.Add(1)
+			go func(writer *pauseWriter) {
+				writer.pause()
+				wg.Done()
+			}(w)
+		}
+		wg.Wait()
+
+		// Reset the counters and clocks.
+		streak = 0
+		rateAcc = 0
+		cooloffStart = time.Now()
+		clockStart = time.Now()
+		debugPrint("Fail. Pausing writers for cool-off period.\n")
+		debugPrint(fmt.Sprintf("new rate=%d\npasses=%v\nfails=%v\nstack=%v\n",
+			desiredRate, pass, fail, stack))
+	}
+
+	// Function closure to run on test-run success.
+	onTestSuccess := func(r writeBenchResult) {
+		streak++
+		pass = append(pass, desiredRate)
+		stack = append(stack, desiredRate)
+
+		// Emit a benchmark raw datapoint.
+		r.passed = true
+		fmt.Println(r)
+
+		// Increase the rate (exponential in the length of the pass streak).
+		desiredRate = desiredRate + incBase*(1<<(streak-1))
+		setLimit(desiredRate)
+
+		// Restart the test.
+		rateAcc = 0
+		clockStart = time.Now()
+
+		debugPrint(fmt.Sprintf("Pass.\nnew rate=%d\npasses=%v\nfails=%v\nstreak=%d\nstack=%v\n",
+			desiredRate, pass, fail, streak, stack))
+	}
+
+	name := fmt.Sprintf("write/values=%s", writeBenchConfig.values)
+	ctx, cancel := context.WithCancel(context.Background())
+	runTest(args[0], test{
+		init: func(db DB, wg *sync.WaitGroup) {
+			y.db = db
+			writersWg = wg
+
+			// Spawn the writers.
+			for i := 0; i < writeBenchConfig.concurrency; i++ {
+				writer := newPauseWriter(y, float64(desiredRate))
+				writers = append(writers, writer)
+				writersWg.Add(1)
+				go writer.run(ctx, wg)
+			}
+			setLimit(desiredRate)
+
+			// Start the clock on the current load.
+			clockStart = time.Now()
+		},
+		tick: func(elapsed time.Duration, i int) {
+			m := y.db.Metrics()
+			if i%20 == 0 {
+				if writeBenchConfig.debug && i > 0 {
+					fmt.Printf("%s\n", m)
+				}
+				fmt.Println("___elapsed___clock___rate(desired)___rate(actual)___L0files___L0levels___levels______lsmBytes___writeAmp")
+			}
+
+			// Print the current stats.
+			l0Files := m.Levels[0].NumFiles
+			l0Sublevels := m.Levels[0].Sublevels
+			nLevels := 0
+			for _, l := range m.Levels {
+				if l.BytesIn > 0 {
+					nLevels++
+				}
+			}
+			lsmBytes := m.DiskSpaceUsage()
+			total := m.Total()
+			writeAmp := (&total).WriteAmp()
+
+			var currRate float64
+			var stalled bool
+			y.reg.Tick(func(tick histogramTick) {
+				h := tick.Hist
+				currRate = float64(h.TotalCount()) / tick.Elapsed.Seconds()
+				stalled = !cooloff && currRate == 0
+			})
+			rateAcc += currRate
+
+			// The heuristic by which the DB can sustain a given write load is
+			// determined by whether the DB, for the configured window of time:
+			//   1) did not encounter a write stall (i.e. write load fell to
+			//      zero),
+			//   2) number of files in L0 was at or below the target, and
+			//   3) number of L0 sub-levels is at or below the target.
+			failed := stalled ||
+				int(l0Files) > writeBenchConfig.targetL0Files ||
+				int(l0Sublevels) > writeBenchConfig.targetL0SubLevels
+
+			// Print the result for this tick.
+			fmt.Printf("%10s %7s %15d %14.1f %9d %10d %8d %13d %10.2f\n",
+				time.Duration(elapsed.Seconds()+0.5)*time.Second,
+				time.Duration(time.Since(clockStart).Seconds()+0.5)*time.Second,
+				desiredRate,
+				currRate,
+				l0Files,
+				l0Sublevels,
+				nLevels,
+				lsmBytes,
+				writeAmp,
+			)
+
+			// If we're in cool-off mode, allow it to complete before resuming
+			// writing.
+			if cooloff {
+				if time.Since(cooloffStart) < writeBenchConfig.cooloffPeriod {
+					return
+				}
+				// NB: trailing newline keeps this debug line from fusing with
+				// the next stats line (debugPrint does not append one).
+				debugPrint("ending cool-off\n")
+
+				// Else, resume writing.
+				cooloff = false
+				for _, w := range writers {
+					w.unpause()
+				}
+				clockStart = time.Now()
+
+				return
+			}
+
+			r := writeBenchResult{
+				name:     name,
+				rate:     desiredRate,
+				elapsed:  time.Duration(elapsed.Seconds()+0.5) * time.Second,
+				bytes:    lsmBytes,
+				levels:   nLevels,
+				writeAmp: writeAmp,
+			}
+
+			if failed {
+				onTestFail(r, cancel)
+				return
+			}
+
+			// Else, the DB could handle the current load. We only increase
+			// after a fixed amount of time at this load as elapsed.
+			testElapsed := time.Since(clockStart)
+			if testElapsed < writeBenchConfig.testPeriod {
+				// This test-run still has time on the clock.
+				return
+			}
+
+			// This test-run has completed.
+
+			// If the average rate over the test is less than the desired rate,
+			// we mark this test-run as a failure. This handles cases where we
+			// encounter a bottleneck that limits write throughput but
+			// incorrectly mark the test as passed.
+			diff := 1 - rateAcc/(float64(desiredRate)*testElapsed.Seconds())
+			if diff > writeBenchConfig.maxRateDipFraction {
+				if writeBenchConfig.debug {
+					debugPrint(fmt.Sprintf(
+						"difference in rates (%.2f) exceeded threshold (%.2f); marking test as failed\n",
+						diff, writeBenchConfig.maxRateDipFraction,
+					))
+				}
+				onTestFail(r, cancel)
+				return
+			}
+
+			// Mark this test-run as passed.
+			onTestSuccess(r)
+		},
+		done: func(elapsed time.Duration) {
+			// Print final analysis.
+			var total int64
+			y.reg.Tick(func(tick histogramTick) {
+				total = tick.Cumulative.TotalCount()
+			})
+			fmt.Println("___elapsed___ops(total)")
+			fmt.Printf("%10s %12d\n", elapsed.Truncate(time.Second), total)
+		},
+	})
+
+	return nil
+}
+
+// debugPrint prints a debug line to stdout if debug logging is enabled via the
+// --debug flag.
+func debugPrint(s string) {
+	if !writeBenchConfig.debug {
+		return
+	}
+	fmt.Print("DEBUG: " + s)
+}
+
+// pauseWriter issues load against a pebble instance, and can be paused on
+// demand to allow the DB to recover.
+type pauseWriter struct {
+	y        *ycsb
+	limiter  *rate.Limiter
+	pauseC   chan struct{}
+	unpauseC chan struct{}
+}
+
+// newPauseWriter returns a new pauseWriter.
+func newPauseWriter(y *ycsb, initialRate float64) *pauseWriter {
+	// Set the burst rate for the limiter to the lowest sensible value to
+	// prevent excessive bursting. Note that a burst of zero effectively
+	// disables the rate limiter, as a wait time of +Inf is returned from all
+	// calls, and `wait(l *rate.Limiter)` will not sleep in this case.
+	const burst = 1
+	return &pauseWriter{
+		y: y,
+		// NB: initialRate is already a float64; no conversion needed.
+		limiter:  rate.NewLimiter(initialRate, burst),
+		pauseC:   make(chan struct{}),
+		unpauseC: make(chan struct{}),
+	}
+}
+
+// run starts the pauseWriter, issuing load against the DB.
+func (w *pauseWriter) run(ctx context.Context, wg *sync.WaitGroup) {
+	defer wg.Done()
+
+	buf := &ycsbBuf{rng: randvar.NewRand()}
+	hist := w.y.reg.Register("insert")
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-w.pauseC:
+			// Hold the goroutine here until we unpause.
+			<-w.unpauseC
+		default:
+			wait(w.limiter)
+			start := time.Now()
+			w.y.insert(w.y.db, buf)
+			hist.Record(time.Since(start))
+		}
+	}
+}
+
+// pause signals that the writer should pause after the current operation.
+func (w *pauseWriter) pause() {
+	w.pauseC <- struct{}{}
+}
+
+// unpause unpauses the writer.
+func (w *pauseWriter) unpause() { + w.unpauseC <- struct{}{} +} + +// setRate sets the rate limit for this writer. +func (w *pauseWriter) setRate(r float64) { + w.limiter.SetRate(r) +} diff --git a/pebble/cmd/pebble/ycsb.go b/pebble/cmd/pebble/ycsb.go new file mode 100644 index 0000000..41de324 --- /dev/null +++ b/pebble/cmd/pebble/ycsb.go @@ -0,0 +1,609 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package main + +import ( + "fmt" + "log" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/internal/ackseq" + "github.com/cockroachdb/pebble/internal/randvar" + "github.com/cockroachdb/pebble/internal/rate" + "github.com/spf13/cobra" + "golang.org/x/exp/rand" +) + +const ( + ycsbInsert = iota + ycsbRead + ycsbScan + ycsbReverseScan + ycsbUpdate + ycsbNumOps +) + +var ycsbConfig struct { + batch *randvar.Flag + keys string + initialKeys int + prepopulatedKeys int + numOps uint64 + scans *randvar.Flag + values *randvar.BytesFlag + workload string +} + +var ycsbCmd = &cobra.Command{ + Use: "ycsb ", + Short: "run customizable YCSB benchmark", + Long: ` +Run a customizable YCSB workload. The workload is specified by the --workload +flag which can take either one of the standard workload mixes (A-F), or +customizable workload fixes specified as a command separated list of op=weight +pairs. For example, --workload=read=50,update=50 performs a workload composed +of 50% reads and 50% updates. This is identical to the standard workload A. + +The --batch, --scans, and --values flags take the specification for a random +variable: [:][-]. The parameter must be one of "uniform" +or "zipf". If is omitted, a uniform distribution is used. If is +omitted it is set to the same value as . The specification "1000" results +in a constant 1000. 
The specification "10-100" results in a uniformly random +variable in the range [10,100). The specification "zipf(10,100)" results in a +zipf distribution with a minimum value of 10 and a maximum value of 100. + +The --batch flag controls the size of batches used for insert and update +operations. The --scans flag controls the number of iterations performed by a +scan operation. Read operations always read a single key. + +The --values flag provides for an optional "/" +suffix. The default target compression ratio is 1.0 (i.e. incompressible random +data). A value of 2 will cause random data to be generated that should compress +to 50% of its uncompressed size. + +Standard workloads: + + A: 50% reads / 50% updates + B: 95% reads / 5% updates + C: 100% reads + D: 95% reads / 5% inserts + E: 95% scans / 5% inserts + F: 100% inserts +`, + Args: cobra.ExactArgs(1), + RunE: runYcsb, +} + +func init() { + initYCSB(ycsbCmd) +} + +func initYCSB(cmd *cobra.Command) { + ycsbConfig.batch = randvar.NewFlag("1") + cmd.Flags().Var( + ycsbConfig.batch, "batch", + "batch size distribution [{zipf,uniform}:]min[-max]") + cmd.Flags().StringVar( + &ycsbConfig.keys, "keys", "zipf", "latest, uniform, or zipf") + cmd.Flags().IntVar( + &ycsbConfig.initialKeys, "initial-keys", 10000, + "initial number of keys to insert before beginning workload") + cmd.Flags().IntVar( + &ycsbConfig.prepopulatedKeys, "prepopulated-keys", 0, + "number of keys that were previously inserted into the database") + cmd.Flags().Uint64VarP( + &ycsbConfig.numOps, "num-ops", "n", 0, + "maximum number of operations (0 means unlimited)") + ycsbConfig.scans = randvar.NewFlag("zipf:1-1000") + cmd.Flags().Var( + ycsbConfig.scans, "scans", + "scan length distribution [{zipf,uniform}:]min[-max]") + cmd.Flags().StringVar( + &ycsbConfig.workload, "workload", "B", + "workload type (A-F) or spec (read=X,update=Y,...)") + ycsbConfig.values = randvar.NewBytesFlag("1000") + cmd.Flags().Var( + ycsbConfig.values, "values", + "value 
size distribution [{zipf,uniform}:]min[-max][/]") +} + +type ycsbWeights []float64 + +func (w ycsbWeights) get(i int) float64 { + if i >= len(w) { + return 0 + } + return w[i] +} + +var ycsbWorkloads = map[string]ycsbWeights{ + "A": { + ycsbRead: 0.5, + ycsbUpdate: 0.5, + }, + "B": { + ycsbRead: 0.95, + ycsbUpdate: 0.05, + }, + "C": { + ycsbRead: 1.0, + }, + "D": { + ycsbInsert: 0.05, + ycsbRead: 0.95, + // TODO(peter): default to skewed-latest distribution. + }, + "E": { + ycsbInsert: 0.05, + ycsbScan: 0.95, + }, + "F": { + ycsbInsert: 1.0, + // TODO(peter): the real workload is read-modify-write. + }, +} + +func ycsbParseWorkload(w string) (ycsbWeights, error) { + if weights := ycsbWorkloads[w]; weights != nil { + return weights, nil + } + iWeights := make([]int, ycsbNumOps) + for _, p := range strings.Split(w, ",") { + parts := strings.Split(p, "=") + if len(parts) != 2 { + return nil, errors.Errorf("malformed weights: %s", errors.Safe(w)) + } + weight, err := strconv.Atoi(parts[1]) + if err != nil { + return nil, err + } + switch parts[0] { + case "insert": + iWeights[ycsbInsert] = weight + case "read": + iWeights[ycsbRead] = weight + case "scan": + iWeights[ycsbScan] = weight + case "rscan": + iWeights[ycsbReverseScan] = weight + case "update": + iWeights[ycsbUpdate] = weight + } + } + + var sum int + for _, w := range iWeights { + sum += w + } + if sum == 0 { + return nil, errors.Errorf("zero weight specified: %s", errors.Safe(w)) + } + + weights := make(ycsbWeights, ycsbNumOps) + for i := range weights { + weights[i] = float64(iWeights[i]) / float64(sum) + } + return weights, nil +} + +func ycsbParseKeyDist(d string) (randvar.Dynamic, error) { + totalKeys := uint64(ycsbConfig.initialKeys + ycsbConfig.prepopulatedKeys) + switch strings.ToLower(d) { + case "latest": + return randvar.NewDefaultSkewedLatest() + case "uniform": + return randvar.NewUniform(1, totalKeys), nil + case "zipf": + return randvar.NewZipf(1, totalKeys, 0.99) + default: + return nil, 
errors.Errorf("unknown distribution: %s", errors.Safe(d)) + } +} + +func runYcsb(cmd *cobra.Command, args []string) error { + if wipe && ycsbConfig.prepopulatedKeys > 0 { + return errors.New("--wipe and --prepopulated-keys both specified which is nonsensical") + } + + weights, err := ycsbParseWorkload(ycsbConfig.workload) + if err != nil { + return err + } + + keyDist, err := ycsbParseKeyDist(ycsbConfig.keys) + if err != nil { + return err + } + + batchDist := ycsbConfig.batch + scanDist := ycsbConfig.scans + if err != nil { + return err + } + + valueDist := ycsbConfig.values + y := newYcsb(weights, keyDist, batchDist, scanDist, valueDist) + runTest(args[0], test{ + init: y.init, + tick: y.tick, + done: y.done, + }) + return nil +} + +type ycsbBuf struct { + rng *rand.Rand + keyBuf []byte + valueBuf []byte + keyNums []uint64 +} + +type ycsb struct { + db DB + writeOpts *pebble.WriteOptions + weights ycsbWeights + reg *histogramRegistry + keyDist randvar.Dynamic + batchDist randvar.Static + scanDist randvar.Static + valueDist *randvar.BytesFlag + readAmpCount atomic.Uint64 + readAmpSum atomic.Uint64 + keyNum *ackseq.S + numOps atomic.Uint64 + limiter *rate.Limiter + opsMap map[string]int +} + +func newYcsb( + weights ycsbWeights, + keyDist randvar.Dynamic, + batchDist, scanDist randvar.Static, + valueDist *randvar.BytesFlag, +) *ycsb { + y := &ycsb{ + reg: newHistogramRegistry(), + weights: weights, + keyDist: keyDist, + batchDist: batchDist, + scanDist: scanDist, + valueDist: valueDist, + opsMap: make(map[string]int), + } + y.writeOpts = pebble.Sync + if disableWAL { + y.writeOpts = pebble.NoSync + } + + ops := map[string]int{ + "insert": ycsbInsert, + "read": ycsbRead, + "rscan": ycsbReverseScan, + "scan": ycsbScan, + "update": ycsbUpdate, + } + for name, op := range ops { + w := y.weights.get(op) + if w == 0 { + continue + } + wstr := fmt.Sprint(int(100 * w)) + fill := strings.Repeat("_", 3-len(wstr)) + if fill == "" { + fill = "_" + } + fullName := 
fmt.Sprintf("%s%s%s", name, fill, wstr) + y.opsMap[fullName] = op + } + return y +} + +func (y *ycsb) init(db DB, wg *sync.WaitGroup) { + y.db = db + + if ycsbConfig.initialKeys > 0 { + buf := &ycsbBuf{rng: randvar.NewRand()} + + b := db.NewBatch() + size := 0 + start := time.Now() + last := start + for i := 1; i <= ycsbConfig.initialKeys; i++ { + if now := time.Now(); now.Sub(last) >= time.Second { + fmt.Printf("%5s inserted %d keys (%0.1f%%)\n", + time.Duration(now.Sub(start).Seconds()+0.5)*time.Second, + i-1, 100*float64(i-1)/float64(ycsbConfig.initialKeys)) + last = now + } + if size >= 1<<20 { + if err := b.Commit(y.writeOpts); err != nil { + log.Fatal(err) + } + b = db.NewBatch() + size = 0 + } + key := y.makeKey(uint64(i+ycsbConfig.prepopulatedKeys), buf) + value := y.randBytes(buf) + if err := b.Set(key, value, nil); err != nil { + log.Fatal(err) + } + size += len(key) + len(value) + } + if err := b.Commit(y.writeOpts); err != nil { + log.Fatal(err) + } + _ = b.Close() + fmt.Printf("inserted keys [%d-%d)\n", + 1+ycsbConfig.prepopulatedKeys, + 1+ycsbConfig.prepopulatedKeys+ycsbConfig.initialKeys) + } + y.keyNum = ackseq.New(uint64(ycsbConfig.initialKeys + ycsbConfig.prepopulatedKeys)) + + y.limiter = maxOpsPerSec.newRateLimiter() + + wg.Add(concurrency) + + // If this workload doesn't produce reads, sample the worst case read-amp + // from Metrics() periodically. + if y.weights.get(ycsbRead) == 0 && y.weights.get(ycsbScan) == 0 && y.weights.get(ycsbReverseScan) == 0 { + wg.Add(1) + go y.sampleReadAmp(db, wg) + } + + for i := 0; i < concurrency; i++ { + go y.run(db, wg) + } +} + +func (y *ycsb) run(db DB, wg *sync.WaitGroup) { + defer wg.Done() + + var latency [ycsbNumOps]*namedHistogram + for name, op := range y.opsMap { + latency[op] = y.reg.Register(name) + } + + buf := &ycsbBuf{rng: randvar.NewRand()} + + ops := randvar.NewWeighted(nil, y.weights...) 
+ for { + wait(y.limiter) + + start := time.Now() + + op := ops.Int() + switch op { + case ycsbInsert: + y.insert(db, buf) + case ycsbRead: + y.read(db, buf) + case ycsbScan: + y.scan(db, buf, false /* reverse */) + case ycsbReverseScan: + y.scan(db, buf, true /* reverse */) + case ycsbUpdate: + y.update(db, buf) + default: + panic("not reached") + } + + latency[op].Record(time.Since(start)) + if ycsbConfig.numOps > 0 && y.numOps.Add(1) >= ycsbConfig.numOps { + break + } + } +} + +func (y *ycsb) sampleReadAmp(db DB, wg *sync.WaitGroup) { + defer wg.Done() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + for range ticker.C { + m := db.Metrics() + y.readAmpCount.Add(1) + y.readAmpSum.Add(uint64(m.ReadAmp())) + if ycsbConfig.numOps > 0 && y.numOps.Load() >= ycsbConfig.numOps { + break + } + } +} + +func (y *ycsb) hashKey(key uint64) uint64 { + // Inlined version of fnv.New64 + Write. + const offset64 = 14695981039346656037 + const prime64 = 1099511628211 + + h := uint64(offset64) + for i := 0; i < 8; i++ { + h *= prime64 + h ^= uint64(key & 0xff) + key >>= 8 + } + return h +} + +func (y *ycsb) makeKey(keyNum uint64, buf *ycsbBuf) []byte { + const size = 24 + 10 + if cap(buf.keyBuf) < size { + buf.keyBuf = make([]byte, size) + } + key := buf.keyBuf[:4] + copy(key, "user") + key = strconv.AppendUint(key, y.hashKey(keyNum), 10) + // Use the MVCC encoding for keys. This appends a timestamp with + // walltime=1. That knowledge is utilized by rocksDB.Scan. + key = append(key, '\x00', '\x00', '\x00', '\x00', '\x00', + '\x00', '\x00', '\x00', '\x01', '\x09') + buf.keyBuf = key + return key +} + +func (y *ycsb) nextReadKey(buf *ycsbBuf) []byte { + // NB: the range of values returned by keyDist is tied to the range returned + // by keyNum.Base. See how these are both incremented by ycsb.insert(). 
+ keyNum := y.keyDist.Uint64(buf.rng) + return y.makeKey(keyNum, buf) +} + +func (y *ycsb) randBytes(buf *ycsbBuf) []byte { + buf.valueBuf = y.valueDist.Bytes(buf.rng, buf.valueBuf) + return buf.valueBuf +} + +func (y *ycsb) insert(db DB, buf *ycsbBuf) { + count := y.batchDist.Uint64(buf.rng) + if cap(buf.keyNums) < int(count) { + buf.keyNums = make([]uint64, count) + } + keyNums := buf.keyNums[:count] + + b := db.NewBatch() + for i := range keyNums { + keyNums[i] = y.keyNum.Next() + _ = b.Set(y.makeKey(keyNums[i], buf), y.randBytes(buf), nil) + } + if err := b.Commit(y.writeOpts); err != nil { + log.Fatal(err) + } + _ = b.Close() + + for i := range keyNums { + delta, err := y.keyNum.Ack(keyNums[i]) + if err != nil { + log.Fatal(err) + } + if delta > 0 { + y.keyDist.IncMax(delta) + } + } +} + +func (y *ycsb) read(db DB, buf *ycsbBuf) { + key := y.nextReadKey(buf) + iter := db.NewIter(nil) + iter.SeekGE(key) + if iter.Valid() { + _ = iter.Key() + _ = iter.Value() + } + + type metrics interface { + Metrics() pebble.IteratorMetrics + } + if m, ok := iter.(metrics); ok { + y.readAmpCount.Add(1) + y.readAmpSum.Add(uint64(m.Metrics().ReadAmp)) + } + + if err := iter.Close(); err != nil { + log.Fatal(err) + } +} + +func (y *ycsb) scan(db DB, buf *ycsbBuf, reverse bool) { + count := y.scanDist.Uint64(buf.rng) + key := y.nextReadKey(buf) + iter := db.NewIter(nil) + if err := db.Scan(iter, key, int64(count), reverse); err != nil { + log.Fatal(err) + } + + type metrics interface { + Metrics() pebble.IteratorMetrics + } + if m, ok := iter.(metrics); ok { + y.readAmpCount.Add(1) + y.readAmpSum.Add(uint64(m.Metrics().ReadAmp)) + } + + if err := iter.Close(); err != nil { + log.Fatal(err) + } +} + +func (y *ycsb) update(db DB, buf *ycsbBuf) { + count := int(y.batchDist.Uint64(buf.rng)) + b := db.NewBatch() + for i := 0; i < count; i++ { + _ = b.Set(y.nextReadKey(buf), y.randBytes(buf), nil) + } + if err := b.Commit(y.writeOpts); err != nil { + log.Fatal(err) + } + _ = b.Close() 
+} + +func (y *ycsb) tick(elapsed time.Duration, i int) { + if i%20 == 0 { + fmt.Println("____optype__elapsed__ops/sec(inst)___ops/sec(cum)__p50(ms)__p95(ms)__p99(ms)_pMax(ms)") + } + y.reg.Tick(func(tick histogramTick) { + h := tick.Hist + + fmt.Printf("%10s %8s %14.1f %14.1f %8.1f %8.1f %8.1f %8.1f\n", + tick.Name, + time.Duration(elapsed.Seconds()+0.5)*time.Second, + float64(h.TotalCount())/tick.Elapsed.Seconds(), + float64(tick.Cumulative.TotalCount())/elapsed.Seconds(), + time.Duration(h.ValueAtQuantile(50)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(95)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(99)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(100)).Seconds()*1000, + ) + }) +} + +func (y *ycsb) done(elapsed time.Duration) { + fmt.Println("\n____optype__elapsed_____ops(total)___ops/sec(cum)__avg(ms)__p50(ms)__p95(ms)__p99(ms)_pMax(ms)") + + resultTick := histogramTick{} + y.reg.Tick(func(tick histogramTick) { + h := tick.Cumulative + if resultTick.Cumulative == nil { + resultTick.Now = tick.Now + resultTick.Cumulative = h + } else { + resultTick.Cumulative.Merge(h) + } + + fmt.Printf("%10s %7.1fs %14d %14.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", + tick.Name, elapsed.Seconds(), h.TotalCount(), + float64(h.TotalCount())/elapsed.Seconds(), + time.Duration(h.Mean()).Seconds()*1000, + time.Duration(h.ValueAtQuantile(50)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(95)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(99)).Seconds()*1000, + time.Duration(h.ValueAtQuantile(100)).Seconds()*1000) + }) + fmt.Println() + + resultHist := resultTick.Cumulative + m := y.db.Metrics() + total := m.Total() + + readAmpCount := y.readAmpCount.Load() + readAmpSum := y.readAmpSum.Load() + if readAmpCount == 0 { + readAmpSum = 0 + readAmpCount = 1 + } + + fmt.Printf("Benchmarkycsb/%s/values=%s %d %0.1f ops/sec %d read %d write %.2f r-amp %0.2f w-amp\n\n", + ycsbConfig.workload, ycsbConfig.values, + resultHist.TotalCount(), + 
float64(resultHist.TotalCount())/elapsed.Seconds(), + total.BytesRead, + total.BytesFlushed+total.BytesCompacted, + float64(readAmpSum)/float64(readAmpCount), + total.WriteAmp(), + ) +} diff --git a/pebble/commit.go b/pebble/commit.go new file mode 100644 index 0000000..38cdbb8 --- /dev/null +++ b/pebble/commit.go @@ -0,0 +1,517 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "runtime" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/pebble/record" +) + +// commitQueue is a lock-free fixed-size single-producer, multi-consumer +// queue. The single producer can enqueue (push) to the head, and consumers can +// dequeue (pop) from the tail. +// +// It has the added feature that it nils out unused slots to avoid unnecessary +// retention of objects. +type commitQueue struct { + // headTail packs together a 32-bit head index and a 32-bit tail index. Both + // are indexes into slots modulo len(slots)-1. + // + // tail = index of oldest data in queue + // head = index of next slot to fill + // + // Slots in the range [tail, head) are owned by consumers. A consumer + // continues to own a slot outside this range until it nils the slot, at + // which point ownership passes to the producer. + // + // The head index is stored in the most-significant bits so that we can + // atomically add to it and the overflow is harmless. + headTail atomic.Uint64 + + // slots is a ring buffer of values stored in this queue. The size must be a + // power of 2. A slot is in use until *both* the tail index has moved beyond + // it and the slot value has been set to nil. The slot value is set to nil + // atomically by the consumer and read atomically by the producer. 
+ slots [record.SyncConcurrency]atomic.Pointer[Batch] +} + +const dequeueBits = 32 + +func (q *commitQueue) unpack(ptrs uint64) (head, tail uint32) { + const mask = 1<> dequeueBits) & mask) + tail = uint32(ptrs & mask) + return +} + +func (q *commitQueue) pack(head, tail uint32) uint64 { + const mask = 1< syncWAL +func (p *commitPipeline) Commit(b *Batch, syncWAL bool, noSyncWait bool) error { + if b.Empty() { + return nil + } + + commitStartTime := time.Now() + // Acquire semaphores. + p.commitQueueSem <- struct{}{} + if syncWAL { + p.logSyncQSem <- struct{}{} + } + b.commitStats.SemaphoreWaitDuration = time.Since(commitStartTime) + + // Prepare the batch for committing: enqueuing the batch in the pending + // queue, determining the batch sequence number and writing the data to the + // WAL. + // + // NB: We set Batch.commitErr on error so that the batch won't be a candidate + // for reuse. See Batch.release(). + mem, err := p.prepare(b, syncWAL, noSyncWait) + if err != nil { + b.db = nil // prevent batch reuse on error + // NB: we are not doing <-p.commitQueueSem since the batch is still + // sitting in the pending queue. We should consider fixing this by also + // removing the batch from the pending queue. + return err + } + + // Apply the batch to the memtable. + if err := p.env.apply(b, mem); err != nil { + b.db = nil // prevent batch reuse on error + // NB: we are not doing <-p.commitQueueSem since the batch is still + // sitting in the pending queue. We should consider fixing this by also + // removing the batch from the pending queue. + return err + } + + // Publish the batch sequence number. + p.publish(b) + + <-p.commitQueueSem + + if !noSyncWait { + // Already waited for commit, so look at the error. + if b.commitErr != nil { + b.db = nil // prevent batch reuse on error + err = b.commitErr + } + } + // Else noSyncWait. The LogWriter can be concurrently writing to + // b.commitErr. 
We will read b.commitErr in Batch.SyncWait after the + // LogWriter is done writing. + + b.commitStats.TotalDuration = time.Since(commitStartTime) + + return err +} + +// AllocateSeqNum allocates count sequence numbers, invokes the prepare +// callback, then the apply callback, and then publishes the sequence +// numbers. AllocateSeqNum does not write to the WAL or add entries to the +// memtable. AllocateSeqNum can be used to sequence an operation such as +// sstable ingestion within the commit pipeline. The prepare callback is +// invoked with commitPipeline.mu held, but note that DB.mu is not held and +// must be locked if necessary. +func (p *commitPipeline) AllocateSeqNum( + count int, prepare func(seqNum uint64), apply func(seqNum uint64), +) { + // This method is similar to Commit and prepare. Be careful about trying to + // share additional code with those methods because Commit and prepare are + // performance critical code paths. + + b := newBatch(nil) + defer b.release() + + // Give the batch a count of 1 so that the log and visible sequence number + // are incremented correctly. + b.data = make([]byte, batchHeaderLen) + b.setCount(uint32(count)) + b.commit.Add(1) + + p.commitQueueSem <- struct{}{} + + p.mu.Lock() + + // Enqueue the batch in the pending queue. Note that while the pending queue + // is lock-free, we want the order of batches to be the same as the sequence + // number order. + p.pending.enqueue(b) + + // Assign the batch a sequence number. Note that we use atomic operations + // here to handle concurrent reads of logSeqNum. commitPipeline.mu provides + // mutual exclusion for other goroutines writing to logSeqNum. + logSeqNum := p.env.logSeqNum.Add(uint64(count)) - uint64(count) + seqNum := logSeqNum + if seqNum == 0 { + // We can't use the value 0 for the global seqnum during ingestion, because + // 0 indicates no global seqnum. So allocate one more seqnum. 
+ p.env.logSeqNum.Add(1) + seqNum++ + } + b.setSeqNum(seqNum) + + // Wait for any outstanding writes to the memtable to complete. This is + // necessary for ingestion so that the check for memtable overlap can see any + // writes that were sequenced before the ingestion. The spin loop is + // unfortunate, but obviates the need for additional synchronization. + for { + visibleSeqNum := p.env.visibleSeqNum.Load() + if visibleSeqNum == logSeqNum { + break + } + runtime.Gosched() + } + + // Invoke the prepare callback. Note the lack of error reporting. Even if the + // callback internally fails, the sequence number needs to be published in + // order to allow the commit pipeline to proceed. + prepare(b.SeqNum()) + + p.mu.Unlock() + + // Invoke the apply callback. + apply(b.SeqNum()) + + // Publish the sequence number. + p.publish(b) + + <-p.commitQueueSem +} + +func (p *commitPipeline) prepare(b *Batch, syncWAL bool, noSyncWait bool) (*memTable, error) { + n := uint64(b.Count()) + if n == invalidBatchCount { + return nil, ErrInvalidBatch + } + var syncWG *sync.WaitGroup + var syncErr *error + switch { + case !syncWAL: + // Only need to wait for the publish. + b.commit.Add(1) + // Remaining cases represent syncWAL=true. + case noSyncWait: + syncErr = &b.commitErr + syncWG = &b.fsyncWait + // Only need to wait synchronously for the publish. The user will + // (asynchronously) wait on the batch's fsyncWait. + b.commit.Add(1) + b.fsyncWait.Add(1) + case !noSyncWait: + syncErr = &b.commitErr + syncWG = &b.commit + // Must wait for both the publish and the WAL fsync. + b.commit.Add(2) + } + + p.mu.Lock() + + // Enqueue the batch in the pending queue. Note that while the pending queue + // is lock-free, we want the order of batches to be the same as the sequence + // number order. + p.pending.enqueue(b) + + // Assign the batch a sequence number. Note that we use atomic operations + // here to handle concurrent reads of logSeqNum. 
commitPipeline.mu provides + // mutual exclusion for other goroutines writing to logSeqNum. + b.setSeqNum(p.env.logSeqNum.Add(n) - n) + + // Write the data to the WAL. + mem, err := p.env.write(b, syncWG, syncErr) + + p.mu.Unlock() + + return mem, err +} + +func (p *commitPipeline) publish(b *Batch) { + // Mark the batch as applied. + b.applied.Store(true) + + // Loop dequeuing applied batches from the pending queue. If our batch was + // the head of the pending queue we are guaranteed that either we'll publish + // it or someone else will dequeueApplied and publish it. If our batch is not the + // head of the queue then either we'll dequeueApplied applied batches and reach our + // batch or there is an unapplied batch blocking us. When that unapplied + // batch applies it will go through the same process and publish our batch + // for us. + for { + t := p.pending.dequeueApplied() + if t == nil { + // Wait for another goroutine to publish us. We might also be waiting for + // the WAL sync to finish. + now := time.Now() + b.commit.Wait() + b.commitStats.CommitWaitDuration += time.Since(now) + break + } + if !t.applied.Load() { + panic("not reached") + } + + // We're responsible for publishing the sequence number for batch t, but + // another concurrent goroutine might sneak in and publish the sequence + // number for a subsequent batch. That's ok as all we're guaranteeing is + // that the sequence number ratchets up. + for { + curSeqNum := p.env.visibleSeqNum.Load() + newSeqNum := t.SeqNum() + uint64(t.Count()) + if newSeqNum <= curSeqNum { + // t's sequence number has already been published. + break + } + if p.env.visibleSeqNum.CompareAndSwap(curSeqNum, newSeqNum) { + // We successfully published t's sequence number. 
+ break + } + } + + t.commit.Done() + } +} diff --git a/pebble/commit_test.go b/pebble/commit_test.go new file mode 100644 index 0000000..51b618d --- /dev/null +++ b/pebble/commit_test.go @@ -0,0 +1,355 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "encoding/binary" + "fmt" + "io" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/cockroachdb/pebble/internal/arenaskl" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/record" + "github.com/cockroachdb/pebble/vfs" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" + "golang.org/x/exp/rand" +) + +type testCommitEnv struct { + logSeqNum atomic.Uint64 + visibleSeqNum atomic.Uint64 + writeCount atomic.Uint64 + applyBuf struct { + sync.Mutex + buf []uint64 + } + queueSemChan chan struct{} +} + +func (e *testCommitEnv) env() commitEnv { + return commitEnv{ + logSeqNum: &e.logSeqNum, + visibleSeqNum: &e.visibleSeqNum, + apply: e.apply, + write: e.write, + } +} + +func (e *testCommitEnv) apply(b *Batch, mem *memTable) error { + e.applyBuf.Lock() + e.applyBuf.buf = append(e.applyBuf.buf, b.SeqNum()) + e.applyBuf.Unlock() + return nil +} + +func (e *testCommitEnv) write(b *Batch, wg *sync.WaitGroup, _ *error) (*memTable, error) { + e.writeCount.Add(1) + if wg != nil { + wg.Done() + <-e.queueSemChan + } + return nil, nil +} + +func TestCommitQueue(t *testing.T) { + var q commitQueue + var batches [16]Batch + for i := range batches { + q.enqueue(&batches[i]) + } + if b := q.dequeueApplied(); b != nil { + t.Fatalf("unexpectedly dequeued batch: %p", b) + } + batches[1].applied.Store(true) + if b := q.dequeueApplied(); b != nil { + t.Fatalf("unexpectedly dequeued batch: %p", b) + } + for i := range batches { + batches[i].applied.Store(true) + if b := q.dequeueApplied(); b != 
&batches[i] { + t.Fatalf("%d: expected batch %p, but found %p", i, &batches[i], b) + } + } + if b := q.dequeueApplied(); b != nil { + t.Fatalf("unexpectedly dequeued batch: %p", b) + } +} + +func TestCommitPipeline(t *testing.T) { + var e testCommitEnv + p := newCommitPipeline(e.env()) + + n := 10000 + if invariants.RaceEnabled { + // Under race builds we have to limit the concurrency or we hit the + // following error: + // + // race: limit on 8128 simultaneously alive goroutines is exceeded, dying + n = 1000 + } + + var wg sync.WaitGroup + wg.Add(n) + for i := 0; i < n; i++ { + go func(i int) { + defer wg.Done() + var b Batch + _ = b.Set([]byte(fmt.Sprint(i)), nil, nil) + _ = p.Commit(&b, false, false) + }(i) + } + wg.Wait() + + if s := e.writeCount.Load(); uint64(n) != s { + t.Fatalf("expected %d written batches, but found %d", n, s) + } + if n != len(e.applyBuf.buf) { + t.Fatalf("expected %d written batches, but found %d", + n, len(e.applyBuf.buf)) + } + if s := e.logSeqNum.Load(); uint64(n) != s { + t.Fatalf("expected %d, but found %d", n, s) + } + if s := e.visibleSeqNum.Load(); uint64(n) != s { + t.Fatalf("expected %d, but found %d", n, s) + } +} + +func TestCommitPipelineSync(t *testing.T) { + n := 10000 + if invariants.RaceEnabled { + // Under race builds we have to limit the concurrency or we hit the + // following error: + // + // race: limit on 8128 simultaneously alive goroutines is exceeded, dying + n = 1000 + } + + for _, noSyncWait := range []bool{false, true} { + t.Run(fmt.Sprintf("no-sync-wait=%t", noSyncWait), func(t *testing.T) { + var e testCommitEnv + p := newCommitPipeline(e.env()) + e.queueSemChan = p.logSyncQSem + + var wg sync.WaitGroup + wg.Add(n) + for i := 0; i < n; i++ { + go func(i int) { + defer wg.Done() + var b Batch + require.NoError(t, b.Set([]byte(fmt.Sprint(i)), nil, nil)) + require.NoError(t, p.Commit(&b, true, noSyncWait)) + if noSyncWait { + require.NoError(t, b.SyncWait()) + } + }(i) + } + wg.Wait() + if s := 
e.writeCount.Load(); uint64(n) != s { + t.Fatalf("expected %d written batches, but found %d", n, s) + } + if n != len(e.applyBuf.buf) { + t.Fatalf("expected %d written batches, but found %d", + n, len(e.applyBuf.buf)) + } + if s := e.logSeqNum.Load(); uint64(n) != s { + t.Fatalf("expected %d, but found %d", n, s) + } + if s := e.visibleSeqNum.Load(); uint64(n) != s { + t.Fatalf("expected %d, but found %d", n, s) + } + }) + } +} + +func TestCommitPipelineAllocateSeqNum(t *testing.T) { + var e testCommitEnv + p := newCommitPipeline(e.env()) + + const n = 10 + var wg sync.WaitGroup + wg.Add(n) + var prepareCount atomic.Uint64 + var applyCount atomic.Uint64 + for i := 1; i <= n; i++ { + go func(i int) { + defer wg.Done() + p.AllocateSeqNum(i, func(_ uint64) { + prepareCount.Add(1) + }, func(seqNum uint64) { + applyCount.Add(1) + }) + }(i) + } + wg.Wait() + + if s := prepareCount.Load(); n != s { + t.Fatalf("expected %d prepares, but found %d", n, s) + } + if s := applyCount.Load(); n != s { + t.Fatalf("expected %d applies, but found %d", n, s) + } + // AllocateSeqNum always returns a non-zero sequence number causing the + // values we see to be offset from 1. + const total = 1 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + if s := e.logSeqNum.Load(); total != s { + t.Fatalf("expected %d, but found %d", total, s) + } + if s := e.visibleSeqNum.Load(); total != s { + t.Fatalf("expected %d, but found %d", total, s) + } +} + +type syncDelayFile struct { + vfs.File + done chan struct{} +} + +func (f *syncDelayFile) Sync() error { + <-f.done + return nil +} + +func TestCommitPipelineWALClose(t *testing.T) { + // This test stresses the edge case of N goroutines blocked in the + // commitPipeline waiting for the log to sync when we concurrently decide to + // rotate and close the log. + + mem := vfs.NewMem() + f, err := mem.Create("test-wal") + require.NoError(t, err) + + // syncDelayFile will block on the done channel before returning from Sync + // call. 
+ sf := &syncDelayFile{ + File: f, + done: make(chan struct{}), + } + + // A basic commitEnv which writes to a WAL. + var wal *record.LogWriter + var walDone sync.WaitGroup + testEnv := commitEnv{ + logSeqNum: new(atomic.Uint64), + visibleSeqNum: new(atomic.Uint64), + apply: func(b *Batch, mem *memTable) error { + // At this point, we've called SyncRecord but the sync is blocked. + walDone.Done() + return nil + }, + write: func(b *Batch, syncWG *sync.WaitGroup, syncErr *error) (*memTable, error) { + _, err := wal.SyncRecord(b.data, syncWG, syncErr) + return nil, err + }, + } + p := newCommitPipeline(testEnv) + wal = record.NewLogWriter(sf, 0 /* logNum */, record.LogWriterConfig{ + WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}), + QueueSemChan: p.logSyncQSem, + }) + + // Launch N (commitConcurrency) goroutines which each create a batch and + // commit it with sync==true. Because of the syncDelayFile, none of these + // operations can complete until syncDelayFile.done is closed. + errCh := make(chan error, cap(p.commitQueueSem)) + walDone.Add(cap(errCh)) + for i := 0; i < cap(errCh); i++ { + go func(i int) { + b := &Batch{} + if err := b.LogData([]byte("foo"), nil); err != nil { + errCh <- err + return + } + errCh <- p.Commit(b, true /* sync */, false) + }(i) + } + + // Wait for all of the WAL writes to queue up. This ensures we don't violate + // the concurrency requirements of LogWriter, and also ensures all of the WAL + // writes are queued. + walDone.Wait() + close(sf.done) + + // Close the WAL. A "queue is full" panic means that something is broken. 
+ require.NoError(t, wal.Close()) + for i := 0; i < cap(errCh); i++ { + require.NoError(t, <-errCh) + } +} + +func BenchmarkCommitPipeline(b *testing.B) { + for _, noSyncWait := range []bool{false, true} { + for _, parallelism := range []int{1, 2, 4, 8, 16, 32, 64, 128} { + b.Run(fmt.Sprintf("no-sync-wait=%t/parallel=%d", noSyncWait, parallelism), + func(b *testing.B) { + b.SetParallelism(parallelism) + mem := newMemTable(memTableOptions{}) + var wal *record.LogWriter + nullCommitEnv := commitEnv{ + logSeqNum: new(atomic.Uint64), + visibleSeqNum: new(atomic.Uint64), + apply: func(b *Batch, mem *memTable) error { + err := mem.apply(b, b.SeqNum()) + if err != nil { + return err + } + mem.writerUnref() + return nil + }, + write: func(b *Batch, syncWG *sync.WaitGroup, syncErr *error) (*memTable, error) { + for { + err := mem.prepare(b) + if err == arenaskl.ErrArenaFull { + mem = newMemTable(memTableOptions{}) + continue + } + if err != nil { + return nil, err + } + break + } + + _, err := wal.SyncRecord(b.data, syncWG, syncErr) + return mem, err + }, + } + p := newCommitPipeline(nullCommitEnv) + wal = record.NewLogWriter(io.Discard, 0, /* logNum */ + record.LogWriterConfig{ + WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}), + QueueSemChan: p.logSyncQSem, + }) + const keySize = 8 + b.SetBytes(2 * keySize) + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + buf := make([]byte, keySize) + + for pb.Next() { + batch := newBatch(nil) + binary.BigEndian.PutUint64(buf, rng.Uint64()) + batch.Set(buf, buf, nil) + if err := p.Commit(batch, true /* sync */, noSyncWait); err != nil { + b.Fatal(err) + } + if noSyncWait { + if err := batch.SyncWait(); err != nil { + b.Fatal(err) + } + } + batch.release() + } + }) + }) + } + } +} diff --git a/pebble/compaction.go b/pebble/compaction.go new file mode 100644 index 0000000..42a709f --- /dev/null +++ b/pebble/compaction.go @@ -0,0 +1,3924 @@ +// 
Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "context" + "fmt" + "io" + "math" + "runtime/pprof" + "sort" + "sync/atomic" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invalidating" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/private" + "github.com/cockroachdb/pebble/internal/rangedel" + "github.com/cockroachdb/pebble/internal/rangekey" + "github.com/cockroachdb/pebble/objstorage" + "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" + "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/shims/cmp" + "github.com/cockroachdb/pebble/shims/slices" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" +) + +var errEmptyTable = errors.New("pebble: empty table") + +// ErrCancelledCompaction is returned if a compaction is cancelled by a +// concurrent excise or ingest-split operation. +var ErrCancelledCompaction = errors.New("pebble: compaction cancelled by a concurrent operation, will retry compaction") + +var compactLabels = pprof.Labels("pebble", "compact") +var flushLabels = pprof.Labels("pebble", "flush") +var gcLabels = pprof.Labels("pebble", "gc") + +// getInternalWriterProperties accesses a private variable (in the +// internal/private package) initialized by the sstable Writer. This indirection +// is necessary to ensure non-Pebble users constructing sstables for ingestion +// are unable to set internal-only properties. 
+var getInternalWriterProperties = private.SSTableInternalProperties.(func(*sstable.Writer) *sstable.Properties) + +// expandedCompactionByteSizeLimit is the maximum number of bytes in all +// compacted files. We avoid expanding the lower level file set of a compaction +// if it would make the total compaction cover more than this many bytes. +func expandedCompactionByteSizeLimit(opts *Options, level int, availBytes uint64) uint64 { + v := uint64(25 * opts.Level(level).TargetFileSize) + + // Never expand a compaction beyond half the available capacity, divided + // by the maximum number of concurrent compactions. Each of the concurrent + // compactions may expand up to this limit, so this attempts to limit + // compactions to half of available disk space. Note that this will not + // prevent compaction picking from pursuing compactions that are larger + // than this threshold before expansion. + diskMax := (availBytes / 2) / uint64(opts.MaxConcurrentCompactions()) + if v > diskMax { + v = diskMax + } + return v +} + +// maxGrandparentOverlapBytes is the maximum bytes of overlap with level+1 +// before we stop building a single file in a level-1 to level compaction. +func maxGrandparentOverlapBytes(opts *Options, level int) uint64 { + return uint64(10 * opts.Level(level).TargetFileSize) +} + +// maxReadCompactionBytes is used to prevent read compactions which +// are too wide. +func maxReadCompactionBytes(opts *Options, level int) uint64 { + return uint64(10 * opts.Level(level).TargetFileSize) +} + +// noCloseIter wraps around a FragmentIterator, intercepting and eliding +// calls to Close. It is used during compaction to ensure that rangeDelIters +// are not closed prematurely. +type noCloseIter struct { + keyspan.FragmentIterator +} + +func (i noCloseIter) Close() error { + return nil +} + +type compactionLevel struct { + level int + files manifest.LevelSlice + // l0SublevelInfo contains information about L0 sublevels being compacted. 
+ // It's only set for the start level of a compaction starting out of L0 and + // is nil for all other compactions. + l0SublevelInfo []sublevelInfo +} + +func (cl compactionLevel) Clone() compactionLevel { + newCL := compactionLevel{ + level: cl.level, + files: cl.files.Reslice(func(start, end *manifest.LevelIterator) {}), + } + return newCL +} +func (cl compactionLevel) String() string { + return fmt.Sprintf(`Level %d, Files %s`, cl.level, cl.files) +} + +// Return output from compactionOutputSplitters. See comment on +// compactionOutputSplitter.shouldSplitBefore() on how this value is used. +type maybeSplit int + +const ( + noSplit maybeSplit = iota + splitNow +) + +// String implements the Stringer interface. +func (c maybeSplit) String() string { + if c == noSplit { + return "no-split" + } + return "split-now" +} + +// compactionOutputSplitter is an interface for encapsulating logic around +// switching the output of a compaction to a new output file. Additional +// constraints around switching compaction outputs that are specific to that +// compaction type (eg. flush splits) are implemented in +// compactionOutputSplitters that compose other child compactionOutputSplitters. +type compactionOutputSplitter interface { + // shouldSplitBefore returns whether we should split outputs before the + // specified "current key". The return value is splitNow or noSplit. + // splitNow means a split is advised before the specified key, and noSplit + // means no split is advised. If shouldSplitBefore(a) advises a split then + // shouldSplitBefore(b) should also advise a split given b >= a, until + // onNewOutput is called. + shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit + // onNewOutput updates internal splitter state when the compaction switches + // to a new sstable, and returns the next limit for the new output which + // would get used to truncate range tombstones if the compaction iterator + // runs out of keys. 
The limit returned MUST be > key according to the + // compaction's comparator. The specified key is the first key in the new + // output, or nil if this sstable will only contain range tombstones already + // in the fragmenter. + onNewOutput(key []byte) []byte +} + +// fileSizeSplitter is a compactionOutputSplitter that enforces target file +// sizes. This splitter splits to a new output file when the estimated file size +// is 0.5x-2x the target file size. If there are overlapping grandparent files, +// this splitter will attempt to split at a grandparent boundary. For example, +// consider the example where a compaction wrote 'd' to the current output file, +// and the next key has a user key 'g': +// +// previous key next key +// | | +// | | +// +---------------|----+ +--|----------+ +// grandparents: | 000006 | | | | 000007 | +// +---------------|----+ +--|----------+ +// a b d e f g i +// +// Splitting the output file F before 'g' will ensure that the current output +// file F does not overlap the grandparent file 000007. Aligning sstable +// boundaries like this can significantly reduce write amplification, since a +// subsequent compaction of F into the grandparent level will avoid needlessly +// rewriting any keys within 000007 that do not overlap F's bounds. Consider the +// following compaction: +// +// +----------------------+ +// input | | +// level +----------------------+ +// \/ +// +---------------+ +---------------+ +// output |XXXXXXX| | | |XXXXXXXX| +// level +---------------+ +---------------+ +// +// The input-level file overlaps two files in the output level, but only +// partially. The beginning of the first output-level file and the end of the +// second output-level file will be rewritten verbatim. This write I/O is +// "wasted" in the sense that no merging is being performed. +// +// To prevent the above waste, this splitter attempts to split output files +// before the start key of grandparent files. 
It still strives to write output +// files of approximately the target file size, by constraining this splitting +// at grandparent points to apply only if the current output's file size is +// about the right order of magnitude. +// +// Note that, unlike most other splitters, this splitter does not guarantee that +// it will advise splits only at user key change boundaries. +type fileSizeSplitter struct { + frontier frontier + targetFileSize uint64 + atGrandparentBoundary bool + boundariesObserved uint64 + nextGrandparent *fileMetadata + grandparents manifest.LevelIterator +} + +func newFileSizeSplitter( + f *frontiers, targetFileSize uint64, grandparents manifest.LevelIterator, +) *fileSizeSplitter { + s := &fileSizeSplitter{targetFileSize: targetFileSize} + s.nextGrandparent = grandparents.First() + s.grandparents = grandparents + if s.nextGrandparent != nil { + s.frontier.Init(f, s.nextGrandparent.Smallest.UserKey, s.reached) + } + return s +} + +func (f *fileSizeSplitter) reached(nextKey []byte) []byte { + f.atGrandparentBoundary = true + f.boundariesObserved++ + // NB: f.grandparents is a bounded iterator, constrained to the compaction + // key range. + f.nextGrandparent = f.grandparents.Next() + if f.nextGrandparent == nil { + return nil + } + // TODO(jackson): Should we also split before or immediately after + // grandparents' largest keys? Splitting before the start boundary prevents + // overlap with the grandparent. Also splitting after the end boundary may + // increase the probability of move compactions. + return f.nextGrandparent.Smallest.UserKey +} + +func (f *fileSizeSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { + atGrandparentBoundary := f.atGrandparentBoundary + + // Clear f.atGrandparentBoundary unconditionally. + // + // This is a bit subtle. 
Even if we do decide to split, it's possible that a + // higher-level splitter will ignore our request (eg, because we're between + // two internal keys with the same user key). In this case, the next call to + // shouldSplitBefore will find atGrandparentBoundary=false. This is + // desirable, because in this case we would've already written the earlier + // key with the same user key to the output file. The current output file is + // already doomed to overlap the grandparent whose bound triggered + // atGrandparentBoundary=true. We should continue on, waiting for the next + // grandparent boundary. + f.atGrandparentBoundary = false + + // If the key is a range tombstone, the EstimatedSize may not grow right + // away when a range tombstone is added to the fragmenter: It's dependent on + // whether or not this new range deletion will start a new fragment. + // Range deletions are rare, so we choose to simply not split yet. + // TODO(jackson): Reconsider this, and consider range keys too as a part of + // #2321. + if key.Kind() == InternalKeyKindRangeDelete || tw == nil { + return noSplit + } + + estSize := tw.EstimatedSize() + switch { + case estSize < f.targetFileSize/2: + // The estimated file size is less than half the target file size. Don't + // split it, even if currently aligned with a grandparent file because + // it's too small. + return noSplit + case estSize >= 2*f.targetFileSize: + // The estimated file size is double the target file size. Split it even + // if we were not aligned with a grandparent file boundary to avoid + // excessively exceeding the target file size. + return splitNow + case !atGrandparentBoundary: + // Don't split if we're not at a grandparent, except if we've exhausted + // all the grandparents overlapping this compaction's key range. Then we + // may want to split purely based on file size. + if f.nextGrandparent == nil { + // There are no more grandparents. 
Optimize for the target file size + // and split as soon as we hit the target file size. + if estSize >= f.targetFileSize { + return splitNow + } + } + return noSplit + default: + // INVARIANT: atGrandparentBoundary + // INVARIANT: targetSize/2 < estSize < 2*targetSize + // + // The estimated file size is close enough to the target file size that + // we should consider splitting. + // + // Determine whether to split now based on how many grandparent + // boundaries we have already observed while building this output file. + // The intuition here is that if the grandparent level is dense in this + // part of the keyspace, we're likely to continue to have more + // opportunities to split this file aligned with a grandparent. If this + // is the first grandparent boundary observed, we split immediately + // (we're already at ≥50% the target file size). Otherwise, each + // overlapping grandparent we've observed increases the minimum file + // size by 5% of the target file size, up to at most 90% of the target + // file size. + // + // TODO(jackson): The particular thresholds are somewhat unprincipled. + // This is the same heuristic as RocksDB implements. Is there a more + // principled formulation that can further reduce w-amp, produce files + // closer to the target file size, or is more understandable? + + // NB: Subtract 1 from `boundariesObserved` to account for the current + // boundary we're considering splitting at. `reached` will have + // incremented it at the same time it set `atGrandparentBoundary`. 
+ minBoundaries := f.boundariesObserved-1 + if minBoundaries > 8 { + minBoundaries = 8 + } + minimumPctOfTargetSize := 50 + 5*minBoundaries + if estSize < (minimumPctOfTargetSize*f.targetFileSize)/100 { + return noSplit + } + return splitNow + } +} + +func (f *fileSizeSplitter) onNewOutput(key []byte) []byte { + f.boundariesObserved = 0 + return nil +} + +func newLimitFuncSplitter(f *frontiers, limitFunc func(userKey []byte) []byte) *limitFuncSplitter { + s := &limitFuncSplitter{limitFunc: limitFunc} + s.frontier.Init(f, nil, s.reached) + return s +} + +type limitFuncSplitter struct { + frontier frontier + limitFunc func(userKey []byte) []byte + split maybeSplit +} + +func (lf *limitFuncSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { + return lf.split +} + +func (lf *limitFuncSplitter) reached(nextKey []byte) []byte { + lf.split = splitNow + return nil +} + +func (lf *limitFuncSplitter) onNewOutput(key []byte) []byte { + lf.split = noSplit + if key != nil { + // TODO(jackson): For some users, like L0 flush splits, there's no need + // to binary search over all the flush splits every time. The next split + // point must be ahead of the previous flush split point. + limit := lf.limitFunc(key) + lf.frontier.Update(limit) + return limit + } + lf.frontier.Update(nil) + return nil +} + +// splitterGroup is a compactionOutputSplitter that splits whenever one of its +// child splitters advises a compaction split. 
+type splitterGroup struct { + cmp Compare + splitters []compactionOutputSplitter +} + +func (a *splitterGroup) shouldSplitBefore( + key *InternalKey, tw *sstable.Writer, +) (suggestion maybeSplit) { + for _, splitter := range a.splitters { + if splitter.shouldSplitBefore(key, tw) == splitNow { + return splitNow + } + } + return noSplit +} + +func (a *splitterGroup) onNewOutput(key []byte) []byte { + var earliestLimit []byte + for _, splitter := range a.splitters { + limit := splitter.onNewOutput(key) + if limit == nil { + continue + } + if earliestLimit == nil || a.cmp(limit, earliestLimit) < 0 { + earliestLimit = limit + } + } + return earliestLimit +} + +// userKeyChangeSplitter is a compactionOutputSplitter that takes in a child +// splitter, and splits when 1) that child splitter has advised a split, and 2) +// the compaction output is at the boundary between two user keys (also +// the boundary between atomic compaction units). Use this splitter to wrap +// any splitters that don't guarantee user key splits (i.e. splitters that make +// their determination in ways other than comparing the current key against a +// limit key.) If a wrapped splitter advises a split, it must continue +// to advise a split until a new output. +type userKeyChangeSplitter struct { + cmp Compare + splitter compactionOutputSplitter + unsafePrevUserKey func() []byte +} + +func (u *userKeyChangeSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { + // NB: The userKeyChangeSplitter only needs to suffer a key comparison if + // the wrapped splitter requests a split. + // + // We could implement this splitter using frontiers: When the inner splitter + // requests a split before key `k`, we'd update a frontier to be + // ImmediateSuccessor(k). Then on the next key greater than >k, the + // frontier's `reached` func would be called and we'd return splitNow. 
+ // This doesn't really save work since duplicate user keys are rare, and it + // requires us to materialize the ImmediateSuccessor key. It also prevents + // us from splitting on the same key that the inner splitter requested a + // split for—instead we need to wait until the next key. The current + // implementation uses `unsafePrevUserKey` to gain access to the previous + // key which allows it to immediately respect the inner splitter if + // possible. + if split := u.splitter.shouldSplitBefore(key, tw); split != splitNow { + return split + } + if u.cmp(key.UserKey, u.unsafePrevUserKey()) > 0 { + return splitNow + } + return noSplit +} + +func (u *userKeyChangeSplitter) onNewOutput(key []byte) []byte { + return u.splitter.onNewOutput(key) +} + +// compactionWritable is a objstorage.Writable wrapper that, on every write, +// updates a metric in `versions` on bytes written by in-progress compactions so +// far. It also increments a per-compaction `written` int. +type compactionWritable struct { + objstorage.Writable + + versions *versionSet + written *int64 +} + +// Write is part of the objstorage.Writable interface. +func (c *compactionWritable) Write(p []byte) error { + if err := c.Writable.Write(p); err != nil { + return err + } + + *c.written += int64(len(p)) + c.versions.incrementCompactionBytes(int64(len(p))) + return nil +} + +type compactionKind int + +const ( + compactionKindDefault compactionKind = iota + compactionKindFlush + // compactionKindMove denotes a move compaction where the input file is + // retained and linked in a new level without being obsoleted. + compactionKindMove + // compactionKindCopy denotes a copy compaction where the input file is + // copied byte-by-byte into a new file with a new FileNum in the output level. 
+ compactionKindCopy + compactionKindDeleteOnly + compactionKindElisionOnly + compactionKindRead + compactionKindRewrite + compactionKindIngestedFlushable +) + +func (k compactionKind) String() string { + switch k { + case compactionKindDefault: + return "default" + case compactionKindFlush: + return "flush" + case compactionKindMove: + return "move" + case compactionKindDeleteOnly: + return "delete-only" + case compactionKindElisionOnly: + return "elision-only" + case compactionKindRead: + return "read" + case compactionKindRewrite: + return "rewrite" + case compactionKindIngestedFlushable: + return "ingested-flushable" + case compactionKindCopy: + return "copy" + } + return "?" +} + +// rangeKeyCompactionTransform is used to transform range key spans as part of the +// keyspan.MergingIter. As part of this transformation step, we can elide range +// keys in the last snapshot stripe, as well as coalesce range keys within +// snapshot stripes. +func rangeKeyCompactionTransform( + eq base.Equal, snapshots []uint64, elideRangeKey func(start, end []byte) bool, +) keyspan.Transformer { + return keyspan.TransformerFunc(func(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { + elideInLastStripe := func(keys []keyspan.Key) []keyspan.Key { + // Unsets and deletes in the last snapshot stripe can be elided. + k := 0 + for j := range keys { + if elideRangeKey(s.Start, s.End) && + (keys[j].Kind() == InternalKeyKindRangeKeyUnset || keys[j].Kind() == InternalKeyKindRangeKeyDelete) { + continue + } + keys[k] = keys[j] + k++ + } + keys = keys[:k] + return keys + } + // snapshots are in ascending order, while s.keys are in descending seqnum + // order. Partition s.keys by snapshot stripes, and call rangekey.Coalesce + // on each partition. 
+ dst.Start = s.Start + dst.End = s.End + dst.Keys = dst.Keys[:0] + i, j := len(snapshots)-1, 0 + usedLen := 0 + for i >= 0 { + start := j + for j < len(s.Keys) && !base.Visible(s.Keys[j].SeqNum(), snapshots[i], base.InternalKeySeqNumMax) { + // Include j in current partition. + j++ + } + if j > start { + keysDst := dst.Keys[usedLen:cap(dst.Keys)] + if err := rangekey.Coalesce(cmp, eq, s.Keys[start:j], &keysDst); err != nil { + return err + } + if j == len(s.Keys) { + // This is the last snapshot stripe. Unsets and deletes can be elided. + keysDst = elideInLastStripe(keysDst) + } + usedLen += len(keysDst) + dst.Keys = append(dst.Keys, keysDst...) + } + i-- + } + if j < len(s.Keys) { + keysDst := dst.Keys[usedLen:cap(dst.Keys)] + if err := rangekey.Coalesce(cmp, eq, s.Keys[j:], &keysDst); err != nil { + return err + } + keysDst = elideInLastStripe(keysDst) + usedLen += len(keysDst) + dst.Keys = append(dst.Keys, keysDst...) + } + return nil + }) +} + +// compaction is a table compaction from one level to the next, starting from a +// given version. +type compaction struct { + // cancel is a bool that can be used by other goroutines to signal a compaction + // to cancel, such as if a conflicting excise operation raced it to manifest + // application. Only holders of the manifest lock will write to this atomic. + cancel atomic.Bool + + kind compactionKind + cmp Compare + equal Equal + comparer *base.Comparer + formatKey base.FormatKey + logger Logger + version *version + stats base.InternalIteratorStats + beganAt time.Time + // versionEditApplied is set to true when a compaction has completed and the + // resulting version has been installed (if successful), but the compaction + // goroutine is still cleaning up (eg, deleting obsolete files). + versionEditApplied bool + bufferPool sstable.BufferPool + + // startLevel is the level that is being compacted. Inputs from startLevel + // and outputLevel will be merged to produce a set of outputLevel files. 
+ startLevel *compactionLevel + + // outputLevel is the level that files are being produced in. outputLevel is + // equal to startLevel+1 except when: + // - if startLevel is 0, the output level equals compactionPicker.baseLevel(). + // - in multilevel compaction, the output level is the lowest level involved in + // the compaction + // A compaction's outputLevel is nil for delete-only compactions. + outputLevel *compactionLevel + + // extraLevels point to additional levels in between the input and output + // levels that get compacted in multilevel compactions + extraLevels []*compactionLevel + + inputs []compactionLevel + + // maxOutputFileSize is the maximum size of an individual table created + // during compaction. + maxOutputFileSize uint64 + // maxOverlapBytes is the maximum number of bytes of overlap allowed for a + // single output table with the tables in the grandparent level. + maxOverlapBytes uint64 + // disableSpanElision disables elision of range tombstones and range keys. Used + // by tests to allow range tombstones or range keys to be added to tables where + // they would otherwise be elided. + disableSpanElision bool + + // flushing contains the flushables (aka memtables) that are being flushed. + flushing flushableList + // bytesIterated contains the number of bytes that have been flushed/compacted. + bytesIterated uint64 + // bytesWritten contains the number of bytes that have been written to outputs. + bytesWritten int64 + + // The boundaries of the input data. + smallest InternalKey + largest InternalKey + + // The range deletion tombstone fragmenter. Adds range tombstones as they are + // returned from `compactionIter` and fragments them for output to files. + // Referenced by `compactionIter` which uses it to check whether keys are deleted. + rangeDelFrag keyspan.Fragmenter + // The range key fragmenter. Similar to rangeDelFrag in that it gets range + // keys from the compaction iter and fragments them for output to files. 
+ rangeKeyFrag keyspan.Fragmenter + // The range deletion tombstone iterator, that merges and fragments + // tombstones across levels. This iterator is included within the compaction + // input iterator as a single level. + // TODO(jackson): Remove this when the refactor of FragmentIterator, + // InterleavingIterator, etc is complete. + rangeDelIter keyspan.InternalIteratorShim + // rangeKeyInterleaving is the interleaving iter for range keys. + rangeKeyInterleaving keyspan.InterleavingIter + + // A list of objects to close when the compaction finishes. Used by input + // iteration to keep rangeDelIters open for the lifetime of the compaction, + // and only close them when the compaction finishes. + closers []io.Closer + + // grandparents are the tables in level+2 that overlap with the files being + // compacted. Used to determine output table boundaries. Do not assume that the actual files + // in the grandparent when this compaction finishes will be the same. + grandparents manifest.LevelSlice + + // Boundaries at which flushes to L0 should be split. Determined by + // L0Sublevels. If nil, flushes aren't split. + l0Limits [][]byte + + // List of disjoint inuse key ranges the compaction overlaps with in + // grandparent and lower levels. See setupInuseKeyRanges() for the + // construction. Used by elideTombstone() and elideRangeTombstone() to + // determine if keys affected by a tombstone possibly exist at a lower level. + inuseKeyRanges []manifest.UserKeyRange + // inuseEntireRange is set if the above inuse key ranges wholly contain the + // compaction's key range. This allows compactions in higher levels to often + // elide key comparisons. + inuseEntireRange bool + elideTombstoneIndex int + + // allowedZeroSeqNum is true if seqnums can be zeroed if there are no + // snapshots requiring them to be kept. This determination is made by + // looking for an sstable which overlaps the bounds of the compaction at a + // lower level in the LSM during runCompaction. 
+ allowedZeroSeqNum bool + + metrics map[int]*LevelMetrics + + pickerMetrics compactionPickerMetrics +} + +func (c *compaction) makeInfo(jobID int) CompactionInfo { + info := CompactionInfo{ + JobID: jobID, + Reason: c.kind.String(), + Input: make([]LevelInfo, 0, len(c.inputs)), + Annotations: []string{}, + } + for _, cl := range c.inputs { + inputInfo := LevelInfo{Level: cl.level, Tables: nil} + iter := cl.files.Iter() + for m := iter.First(); m != nil; m = iter.Next() { + inputInfo.Tables = append(inputInfo.Tables, m.TableInfo()) + } + info.Input = append(info.Input, inputInfo) + } + if c.outputLevel != nil { + info.Output.Level = c.outputLevel.level + + // If there are no inputs from the output level (eg, a move + // compaction), add an empty LevelInfo to info.Input. + if len(c.inputs) > 0 && c.inputs[len(c.inputs)-1].level != c.outputLevel.level { + info.Input = append(info.Input, LevelInfo{Level: c.outputLevel.level}) + } + } else { + // For a delete-only compaction, set the output level to L6. The + // output level is not meaningful here, but complicating the + // info.Output interface with a pointer doesn't seem worth the + // semantic distinction. 
+ info.Output.Level = numLevels - 1 + } + + for i, score := range c.pickerMetrics.scores { + info.Input[i].Score = score + } + info.SingleLevelOverlappingRatio = c.pickerMetrics.singleLevelOverlappingRatio + info.MultiLevelOverlappingRatio = c.pickerMetrics.multiLevelOverlappingRatio + if len(info.Input) > 2 { + info.Annotations = append(info.Annotations, "multilevel") + } + return info +} + +func newCompaction( + pc *pickedCompaction, opts *Options, beganAt time.Time, provider objstorage.Provider, +) *compaction { + c := &compaction{ + kind: compactionKindDefault, + cmp: pc.cmp, + equal: opts.equal(), + comparer: opts.Comparer, + formatKey: opts.Comparer.FormatKey, + inputs: pc.inputs, + smallest: pc.smallest, + largest: pc.largest, + logger: opts.Logger, + version: pc.version, + beganAt: beganAt, + maxOutputFileSize: pc.maxOutputFileSize, + maxOverlapBytes: pc.maxOverlapBytes, + pickerMetrics: pc.pickerMetrics, + } + c.startLevel = &c.inputs[0] + if pc.startLevel.l0SublevelInfo != nil { + c.startLevel.l0SublevelInfo = pc.startLevel.l0SublevelInfo + } + c.outputLevel = &c.inputs[1] + + if len(pc.extraLevels) > 0 { + c.extraLevels = pc.extraLevels + c.outputLevel = &c.inputs[len(c.inputs)-1] + } + // Compute the set of outputLevel+1 files that overlap this compaction (these + // are the grandparent sstables). + if c.outputLevel.level+1 < numLevels { + c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.cmp, + c.smallest.UserKey, c.largest.UserKey, c.largest.IsExclusiveSentinel()) + } + c.setupInuseKeyRanges() + c.kind = pc.kind + + if c.kind == compactionKindDefault && c.outputLevel.files.Empty() && !c.hasExtraLevelData() && + c.startLevel.files.Len() == 1 && c.grandparents.SizeSum() <= c.maxOverlapBytes { + // This compaction can be converted into a move or copy from one level + // to the next. We avoid such a move if there is lots of overlapping + // grandparent data. 
Otherwise, the move could create a parent file + // that will require a very expensive merge later on. + iter := c.startLevel.files.Iter() + meta := iter.First() + isRemote := false + // We should always be passed a provider, except in some unit tests. + if provider != nil { + objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) + if err != nil { + panic(errors.Wrapf(err, "cannot lookup table %s in provider", meta.FileBacking.DiskFileNum)) + } + isRemote = objMeta.IsRemote() + } + // Avoid a trivial move or copy if all of these are true, as rewriting a + // new file is better: + // + // 1) The source file is a virtual sstable + // 2) The existing file `meta` is on non-remote storage + // 3) The output level prefers shared storage + mustCopy := !isRemote && remote.ShouldCreateShared(opts.Experimental.CreateOnShared, c.outputLevel.level) + if mustCopy { + // If the source is virtual, it's best to just rewrite the file as all + // conditions in the above comment are met. + if !meta.Virtual { + c.kind = compactionKindCopy + } + } else { + c.kind = compactionKindMove + } + } + return c +} + +func newDeleteOnlyCompaction( + opts *Options, cur *version, inputs []compactionLevel, beganAt time.Time, +) *compaction { + c := &compaction{ + kind: compactionKindDeleteOnly, + cmp: opts.Comparer.Compare, + equal: opts.equal(), + comparer: opts.Comparer, + formatKey: opts.Comparer.FormatKey, + logger: opts.Logger, + version: cur, + beganAt: beganAt, + inputs: inputs, + } + + // Set c.smallest, c.largest. + files := make([]manifest.LevelIterator, 0, len(inputs)) + for _, in := range inputs { + files = append(files, in.files.Iter()) + } + c.smallest, c.largest = manifest.KeyRange(opts.Comparer.Compare, files...) + return c +} + +func adjustGrandparentOverlapBytesForFlush(c *compaction, flushingBytes uint64) { + // Heuristic to place a lower bound on compaction output file size + // caused by Lbase. 
Prior to this heuristic we have observed an L0 in + // production with 310K files of which 290K files were < 10KB in size. + // Our hypothesis is that it was caused by L1 having 2600 files and + // ~10GB, such that each flush got split into many tiny files due to + // overlapping with most of the files in Lbase. + // + // The computation below is general in that it accounts + // for flushing different volumes of data (e.g. we may be flushing + // many memtables). For illustration, we consider the typical + // example of flushing a 64MB memtable. So 12.8MB output, + // based on the compression guess below. If the compressed bytes + // guess is an over-estimate we will end up with smaller files, + // and if an under-estimate we will end up with larger files. + // With a 2MB target file size, 7 files. We are willing to accept + // 4x the number of files, if it results in better write amplification + // when later compacting to Lbase, i.e., ~450KB files (target file + // size / 4). + // + // Note that this is a pessimistic heuristic in that + // fileCountUpperBoundDueToGrandparents could be far from the actual + // number of files produced due to the grandparent limits. For + // example, in the extreme, consider a flush that overlaps with 1000 + // files in Lbase f0...f999, and the initially calculated value of + // maxOverlapBytes will cause splits at f10, f20,..., f990, which + // means an upper bound file count of 100 files. Say the input bytes + // in the flush are such that acceptableFileCount=10. We will fatten + // up maxOverlapBytes by 10x to ensure that the upper bound file count + // drops to 10. However, it is possible that in practice, even without + // this change, we would have produced no more than 10 files, and that + // this change makes the files unnecessarily wide. Say the input bytes + // are distributed such that 10% are in f0...f9, 10% in f10...f19, ... + // 10% in f80...f89 and 10% in f990...f999. 
The original value of + // maxOverlapBytes would have actually produced only 10 sstables. But + // by increasing maxOverlapBytes by 10x, we may produce 1 sstable that + // spans f0...f89, i.e., a much wider sstable than necessary. + // + // We could produce a tighter estimate of + // fileCountUpperBoundDueToGrandparents if we had knowledge of the key + // distribution of the flush. The 4x multiplier mentioned earlier is + // a way to try to compensate for this pessimism. + // + // TODO(sumeer): we don't have compression info for the data being + // flushed, but it is likely that existing files that overlap with + // this flush in Lbase are representative wrt compression ratio. We + // could store the uncompressed size in FileMetadata and estimate + // the compression ratio. + const approxCompressionRatio = 0.2 + approxOutputBytes := approxCompressionRatio * float64(flushingBytes) + approxNumFilesBasedOnTargetSize := + int(math.Ceil(approxOutputBytes / float64(c.maxOutputFileSize))) + acceptableFileCount := float64(4 * approxNumFilesBasedOnTargetSize) + // The byte calculation is linear in numGrandparentFiles, but we will + // incur this linear cost in findGrandparentLimit too, so we are also + // willing to pay it now. We could approximate this cheaply by using + // the mean file size of Lbase. 
+ grandparentFileBytes := c.grandparents.SizeSum() + fileCountUpperBoundDueToGrandparents := + float64(grandparentFileBytes) / float64(c.maxOverlapBytes) + if fileCountUpperBoundDueToGrandparents > acceptableFileCount { + c.maxOverlapBytes = uint64( + float64(c.maxOverlapBytes) * + (fileCountUpperBoundDueToGrandparents / acceptableFileCount)) + } +} + +func newFlush( + opts *Options, cur *version, baseLevel int, flushing flushableList, beganAt time.Time, +) *compaction { + c := &compaction{ + kind: compactionKindFlush, + cmp: opts.Comparer.Compare, + equal: opts.equal(), + comparer: opts.Comparer, + formatKey: opts.Comparer.FormatKey, + logger: opts.Logger, + version: cur, + beganAt: beganAt, + inputs: []compactionLevel{{level: -1}, {level: 0}}, + maxOutputFileSize: math.MaxUint64, + maxOverlapBytes: math.MaxUint64, + flushing: flushing, + } + c.startLevel = &c.inputs[0] + c.outputLevel = &c.inputs[1] + + if len(flushing) > 0 { + if _, ok := flushing[0].flushable.(*ingestedFlushable); ok { + if len(flushing) != 1 { + panic("pebble: ingestedFlushable must be flushed one at a time.") + } + c.kind = compactionKindIngestedFlushable + return c + } + } + + // Make sure there's no ingestedFlushable after the first flushable in the + // list. 
+ for _, f := range flushing { + if _, ok := f.flushable.(*ingestedFlushable); ok { + panic("pebble: flushing shouldn't contain ingestedFlushable flushable") + } + } + + if cur.L0Sublevels != nil { + c.l0Limits = cur.L0Sublevels.FlushSplitKeys() + } + + smallestSet, largestSet := false, false + updatePointBounds := func(iter internalIterator) { + if key, _ := iter.First(); key != nil { + if !smallestSet || + base.InternalCompare(c.cmp, c.smallest, *key) > 0 { + smallestSet = true + c.smallest = key.Clone() + } + } + if key, _ := iter.Last(); key != nil { + if !largestSet || + base.InternalCompare(c.cmp, c.largest, *key) < 0 { + largestSet = true + c.largest = key.Clone() + } + } + } + + updateRangeBounds := func(iter keyspan.FragmentIterator) { + // File bounds require s != nil && !s.Empty(). We only need to check for + // s != nil here, as the memtable's FragmentIterator would never surface + // empty spans. + if s := iter.First(); s != nil { + if key := s.SmallestKey(); !smallestSet || + base.InternalCompare(c.cmp, c.smallest, key) > 0 { + smallestSet = true + c.smallest = key.Clone() + } + } + if s := iter.Last(); s != nil { + if key := s.LargestKey(); !largestSet || + base.InternalCompare(c.cmp, c.largest, key) < 0 { + largestSet = true + c.largest = key.Clone() + } + } + } + + var flushingBytes uint64 + for i := range flushing { + f := flushing[i] + updatePointBounds(f.newIter(nil)) + if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { + updateRangeBounds(rangeDelIter) + } + if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { + updateRangeBounds(rangeKeyIter) + } + flushingBytes += f.inuseBytes() + } + + if opts.FlushSplitBytes > 0 { + c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize) + c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0) + c.grandparents = c.version.Overlaps(baseLevel, c.cmp, c.smallest.UserKey, + c.largest.UserKey, c.largest.IsExclusiveSentinel()) + adjustGrandparentOverlapBytesForFlush(c, flushingBytes) 
+ } + + c.setupInuseKeyRanges() + return c +} + +func (c *compaction) hasExtraLevelData() bool { + if len(c.extraLevels) == 0 { + // not a multi level compaction + return false + } else if c.extraLevels[0].files.Empty() { + // a multi level compaction without data in the intermediate input level; + // e.g. for a multi level compaction with levels 4,5, and 6, this could + // occur if there is no files to compact in 5, or in 5 and 6 (i.e. a move). + return false + } + return true +} + +func (c *compaction) setupInuseKeyRanges() { + level := c.outputLevel.level + 1 + if c.outputLevel.level == 0 { + level = 0 + } + // calculateInuseKeyRanges will return a series of sorted spans. Overlapping + // or abutting spans have already been merged. + c.inuseKeyRanges = calculateInuseKeyRanges( + c.version, c.cmp, level, numLevels-1, c.smallest.UserKey, c.largest.UserKey, + ) + // Check if there's a single in-use span that encompasses the entire key + // range of the compaction. This is an optimization to avoid key comparisons + // against inuseKeyRanges during the compaction when every key within the + // compaction overlaps with an in-use span. + if len(c.inuseKeyRanges) > 0 { + c.inuseEntireRange = c.cmp(c.inuseKeyRanges[0].Start, c.smallest.UserKey) <= 0 && + c.cmp(c.inuseKeyRanges[0].End, c.largest.UserKey) >= 0 + } +} + +func calculateInuseKeyRanges( + v *version, cmp base.Compare, level, maxLevel int, smallest, largest []byte, +) []manifest.UserKeyRange { + // Use two slices, alternating which one is input and which one is output + // as we descend the LSM. + var input, output []manifest.UserKeyRange + + // L0 requires special treatment, since sstables within L0 may overlap. + // We use the L0 Sublevels structure to efficiently calculate the merged + // in-use key ranges. 
+ if level == 0 { + output = v.L0Sublevels.InUseKeyRanges(smallest, largest) + level++ + } + + for ; level <= maxLevel; level++ { + // NB: We always treat `largest` as inclusive for simplicity, because + // there's little consequence to calculating slightly broader in-use key + // ranges. + overlaps := v.Overlaps(level, cmp, smallest, largest, false /* exclusiveEnd */) + iter := overlaps.Iter() + + // We may already have in-use key ranges from higher levels. Iterate + // through both our accumulated in-use key ranges and this level's + // files, merging the two. + // + // Tables higher within the LSM have broader key spaces. We use this + // when possible to seek past a level's files that are contained by + // our current accumulated in-use key ranges. This helps avoid + // per-sstable work during flushes or compactions in high levels which + // overlap the majority of the LSM's sstables. + input, output = output, input + output = output[:0] + + var currFile *fileMetadata + var currAccum *manifest.UserKeyRange + if len(input) > 0 { + currAccum, input = &input[0], input[1:] + } + + // If we have an accumulated key range and its start is ≤ smallest, + // we can seek to the accumulated range's end. Otherwise, we need to + // start at the first overlapping file within the level. + if currAccum != nil && cmp(currAccum.Start, smallest) <= 0 { + currFile = seekGT(&iter, cmp, currAccum.End) + } else { + currFile = iter.First() + } + + for currFile != nil || currAccum != nil { + // If we've exhausted either the files in the level or the + // accumulated key ranges, we just need to append the one we have. + // If we have both a currFile and a currAccum, they either overlap + // or they're disjoint. If they're disjoint, we append whichever + // one sorts first and move on to the next file or range. If they + // overlap, we merge them into currAccum and proceed to the next + // file. 
+ switch { + case currAccum == nil || (currFile != nil && cmp(currFile.Largest.UserKey, currAccum.Start) < 0): + // This file is strictly before the current accumulated range, + // or there are no more accumulated ranges. + output = append(output, manifest.UserKeyRange{ + Start: currFile.Smallest.UserKey, + End: currFile.Largest.UserKey, + }) + currFile = iter.Next() + case currFile == nil || (currAccum != nil && cmp(currAccum.End, currFile.Smallest.UserKey) < 0): + // The current accumulated key range is strictly before the + // current file, or there are no more files. + output = append(output, *currAccum) + currAccum = nil + if len(input) > 0 { + currAccum, input = &input[0], input[1:] + } + default: + // The current accumulated range and the current file overlap. + // Adjust the accumulated range to be the union. + if cmp(currFile.Smallest.UserKey, currAccum.Start) < 0 { + currAccum.Start = currFile.Smallest.UserKey + } + if cmp(currFile.Largest.UserKey, currAccum.End) > 0 { + currAccum.End = currFile.Largest.UserKey + } + + // Extending `currAccum`'s end boundary may have caused it to + // overlap with `input` key ranges that we haven't processed + // yet. Merge any such key ranges. + for len(input) > 0 && cmp(input[0].Start, currAccum.End) <= 0 { + if cmp(input[0].End, currAccum.End) > 0 { + currAccum.End = input[0].End + } + input = input[1:] + } + // Seek the level iterator past our current accumulated end. + currFile = seekGT(&iter, cmp, currAccum.End) + } + } + } + return output +} + +func seekGT(iter *manifest.LevelIterator, cmp base.Compare, key []byte) *manifest.FileMetadata { + f := iter.SeekGE(cmp, key) + for f != nil && cmp(f.Largest.UserKey, key) == 0 { + f = iter.Next() + } + return f +} + +// findGrandparentLimit takes the start user key for a table and returns the +// user key to which that table can extend without excessively overlapping +// the grandparent level. 
If no limit is needed considering the grandparent
+// files, this function returns nil. This is done in order to prevent a table
+// at level N from overlapping too much data at level N+1. We want to avoid
+// such large overlaps because they translate into large compactions. The
+// current heuristic stops output of a table if the addition of another key
+// would cause the table to overlap more than 10x the target file size at
+// level N. See maxGrandparentOverlapBytes.
+func (c *compaction) findGrandparentLimit(start []byte) []byte {
+ iter := c.grandparents.Iter()
+ var overlappedBytes uint64
+ var greater bool
+ for f := iter.SeekGE(c.cmp, start); f != nil; f = iter.Next() {
+ overlappedBytes += f.Size
+ // To ensure forward progress we always return a larger user
+ // key than where we started. See comments above clients of
+ // this function for how this is used.
+ greater = greater || c.cmp(f.Smallest.UserKey, start) > 0
+ if !greater {
+ continue
+ }
+
+ // We return the smallest bound of a sstable rather than the
+ // largest because the smallest is always inclusive, and limits
+ // are used exclusively when truncating range tombstones. If we
+ // truncated an output to the largest key while there's a
+ // pending tombstone, the next output file would also overlap
+ // the same grandparent f.
+ if overlappedBytes > c.maxOverlapBytes {
+ return f.Smallest.UserKey
+ }
+ }
+ return nil
+}
+
+// findL0Limit takes the start key for a table and returns the user key to which
+// that table can be extended without hitting the next l0Limit. Having flushed
+// sstables "bridging across" an l0Limit could lead to increased L0 -> LBase
+// compaction sizes as well as elevated read amplification. 
+func (c *compaction) findL0Limit(start []byte) []byte { + if c.startLevel.level > -1 || c.outputLevel.level != 0 || len(c.l0Limits) == 0 { + return nil + } + index := sort.Search(len(c.l0Limits), func(i int) bool { + return c.cmp(c.l0Limits[i], start) > 0 + }) + if index < len(c.l0Limits) { + return c.l0Limits[index] + } + return nil +} + +// errorOnUserKeyOverlap returns an error if the last two written sstables in +// this compaction have revisions of the same user key present in both sstables, +// when it shouldn't (eg. when splitting flushes). +func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error { + if n := len(ve.NewFiles); n > 1 { + meta := ve.NewFiles[n-1].Meta + prevMeta := ve.NewFiles[n-2].Meta + if !prevMeta.Largest.IsExclusiveSentinel() && + c.cmp(prevMeta.Largest.UserKey, meta.Smallest.UserKey) >= 0 { + return errors.Errorf("pebble: compaction split user key across two sstables: %s in %s and %s", + prevMeta.Largest.Pretty(c.formatKey), + prevMeta.FileNum, + meta.FileNum) + } + } + return nil +} + +// allowZeroSeqNum returns true if seqnum's can be zeroed if there are no +// snapshots requiring them to be kept. It performs this determination by +// looking for an sstable which overlaps the bounds of the compaction at a +// lower level in the LSM. +func (c *compaction) allowZeroSeqNum() bool { + return c.elideRangeTombstone(c.smallest.UserKey, c.largest.UserKey) +} + +// elideTombstone returns true if it is ok to elide a tombstone for the +// specified key. A return value of true guarantees that there are no key/value +// pairs at c.level+2 or higher that possibly contain the specified user +// key. The keys in multiple invocations to elideTombstone must be supplied in +// order. 
+func (c *compaction) elideTombstone(key []byte) bool { + if c.inuseEntireRange || len(c.flushing) != 0 { + return false + } + + for ; c.elideTombstoneIndex < len(c.inuseKeyRanges); c.elideTombstoneIndex++ { + r := &c.inuseKeyRanges[c.elideTombstoneIndex] + if c.cmp(key, r.End) <= 0 { + if c.cmp(key, r.Start) >= 0 { + return false + } + break + } + } + return true +} + +// elideRangeTombstone returns true if it is ok to elide the specified range +// tombstone. A return value of true guarantees that there are no key/value +// pairs at c.outputLevel.level+1 or higher that possibly overlap the specified +// tombstone. +func (c *compaction) elideRangeTombstone(start, end []byte) bool { + // Disable range tombstone elision if the testing knob for that is enabled, + // or if we are flushing memtables. The latter requirement is due to + // inuseKeyRanges not accounting for key ranges in other memtables that are + // being flushed in the same compaction. It's possible for a range tombstone + // in one memtable to overlap keys in a preceding memtable in c.flushing. + // + // This function is also used in setting allowZeroSeqNum, so disabling + // elision of range tombstones also disables zeroing of SeqNums. + // + // TODO(peter): we disable zeroing of seqnums during flushing to match + // RocksDB behavior and to avoid generating overlapping sstables during + // DB.replayWAL. When replaying WAL files at startup, we flush after each + // WAL is replayed building up a single version edit that is + // applied. Because we don't apply the version edit after each flush, this + // code doesn't know that L0 contains files and zeroing of seqnums should + // be disabled. That is fixable, but it seems safer to just match the + // RocksDB behavior for now. 
+ if c.disableSpanElision || len(c.flushing) != 0 { + return false + } + + lower := sort.Search(len(c.inuseKeyRanges), func(i int) bool { + return c.cmp(c.inuseKeyRanges[i].End, start) >= 0 + }) + upper := sort.Search(len(c.inuseKeyRanges), func(i int) bool { + return c.cmp(c.inuseKeyRanges[i].Start, end) > 0 + }) + return lower >= upper +} + +// elideRangeKey returns true if it is ok to elide the specified range key. A +// return value of true guarantees that there are no key/value pairs at +// c.outputLevel.level+1 or higher that possibly overlap the specified range key. +func (c *compaction) elideRangeKey(start, end []byte) bool { + // TODO(bilal): Track inuseKeyRanges separately for the range keyspace as + // opposed to the point keyspace. Once that is done, elideRangeTombstone + // can just check in the point keyspace, and this function can check for + // inuseKeyRanges in the range keyspace. + return c.elideRangeTombstone(start, end) +} + +// newInputIter returns an iterator over all the input tables in a compaction. +func (c *compaction) newInputIter( + newIters tableNewIters, newRangeKeyIter keyspan.TableNewSpanIter, snapshots []uint64, +) (_ internalIterator, retErr error) { + // Validate the ordering of compaction input files for defense in depth. + // TODO(jackson): Some of the CheckOrdering calls may be adapted to pass + // ProhibitSplitUserKeys if we thread the active format major version in. Or + // if we remove support for earlier FMVs, we can remove the parameter + // altogether. 
+ if len(c.flushing) == 0 { + if c.startLevel.level >= 0 { + err := manifest.CheckOrdering(c.cmp, c.formatKey, + manifest.Level(c.startLevel.level), c.startLevel.files.Iter(), + manifest.AllowSplitUserKeys) + if err != nil { + return nil, err + } + } + err := manifest.CheckOrdering(c.cmp, c.formatKey, + manifest.Level(c.outputLevel.level), c.outputLevel.files.Iter(), + manifest.AllowSplitUserKeys) + if err != nil { + return nil, err + } + if c.startLevel.level == 0 { + if c.startLevel.l0SublevelInfo == nil { + panic("l0SublevelInfo not created for compaction out of L0") + } + for _, info := range c.startLevel.l0SublevelInfo { + err := manifest.CheckOrdering(c.cmp, c.formatKey, + info.sublevel, info.Iter(), + // NB: L0 sublevels have never allowed split user keys. + manifest.ProhibitSplitUserKeys) + if err != nil { + return nil, err + } + } + } + if len(c.extraLevels) > 0 { + if len(c.extraLevels) > 1 { + panic("n>2 multi level compaction not implemented yet") + } + interLevel := c.extraLevels[0] + err := manifest.CheckOrdering(c.cmp, c.formatKey, + manifest.Level(interLevel.level), interLevel.files.Iter(), + manifest.AllowSplitUserKeys) + if err != nil { + return nil, err + } + } + } + + // There are three classes of keys that a compaction needs to process: point + // keys, range deletion tombstones and range keys. Collect all iterators for + // all these classes of keys from all the levels. We'll aggregate them + // together farther below. + // + // numInputLevels is an approximation of the number of iterator levels. Due + // to idiosyncrasies in iterator construction, we may (rarely) exceed this + // initial capacity. 
+ numInputLevels := len(c.flushing)
+ if numInputLevels < len(c.inputs) {
+ numInputLevels = len(c.inputs)
+ }
+ iters := make([]internalIterator, 0, numInputLevels)
+ rangeDelIters := make([]keyspan.FragmentIterator, 0, numInputLevels)
+ rangeKeyIters := make([]keyspan.FragmentIterator, 0, numInputLevels)
+
+ // If construction of the iterator inputs fails, ensure that we close all
+ // the constituent iterators.
+ defer func() {
+ if retErr != nil {
+ for _, iter := range iters {
+ if iter != nil {
+ iter.Close()
+ }
+ }
+ for _, rangeDelIter := range rangeDelIters {
+ rangeDelIter.Close()
+ }
+ }
+ }()
+ iterOpts := IterOptions{
+ CategoryAndQoS: sstable.CategoryAndQoS{
+ Category: "pebble-compaction",
+ QoSLevel: sstable.NonLatencySensitiveQoSLevel,
+ },
+ logger: c.logger,
+ }
+
+ // Populate iters, rangeDelIters and rangeKeyIters with the appropriate
+ // constituent iterators. This depends on whether this is a flush or a
+ // compaction.
+ if len(c.flushing) != 0 {
+ // If flushing, we need to build the input iterators over the memtables
+ // stored in c.flushing.
+ for i := range c.flushing {
+ f := c.flushing[i]
+ iters = append(iters, f.newFlushIter(nil, &c.bytesIterated))
+ rangeDelIter := f.newRangeDelIter(nil)
+ if rangeDelIter != nil {
+ rangeDelIters = append(rangeDelIters, rangeDelIter)
+ }
+ if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
+ rangeKeyIters = append(rangeKeyIters, rangeKeyIter)
+ }
+ }
+ } else {
+ addItersForLevel := func(level *compactionLevel, l manifest.Level) error {
+ // Add a *levelIter for point iterators. Because we don't call
+ // initRangeDel, the levelIter will close and forget the range
+ // deletion iterator when it steps on to a new file. Surfacing range
+ // deletions to compactions are handled below. 
+ iters = append(iters, newLevelIter(context.Background(), + iterOpts, c.comparer, newIters, level.files.Iter(), l, internalIterOpts{ + bytesIterated: &c.bytesIterated, + bufferPool: &c.bufferPool, + })) + // TODO(jackson): Use keyspan.LevelIter to avoid loading all the range + // deletions into memory upfront. (See #2015, which reverted this.) + // There will be no user keys that are split between sstables + // within a level in Cockroach 23.1, which unblocks this optimization. + + // Add the range deletion iterator for each file as an independent level + // in mergingIter, as opposed to making a levelIter out of those. This + // is safer as levelIter expects all keys coming from underlying + // iterators to be in order. Due to compaction / tombstone writing + // logic in finishOutput(), it is possible for range tombstones to not + // be strictly ordered across all files in one level. + // + // Consider this example from the metamorphic tests (also repeated in + // finishOutput()), consisting of three L3 files with their bounds + // specified in square brackets next to the file name: + // + // ./000240.sst [tmgc#391,MERGE-tmgc#391,MERGE] + // tmgc#391,MERGE [786e627a] + // tmgc-udkatvs#331,RANGEDEL + // + // ./000241.sst [tmgc#384,MERGE-tmgc#384,MERGE] + // tmgc#384,MERGE [666c7070] + // tmgc-tvsalezade#383,RANGEDEL + // tmgc-tvsalezade#331,RANGEDEL + // + // ./000242.sst [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL] + // tmgc-tvsalezade#383,RANGEDEL + // tmgc#375,SET [72646c78766965616c72776865676e79] + // tmgc-tvsalezade#356,RANGEDEL + // + // Here, the range tombstone in 000240.sst falls "after" one in + // 000241.sst, despite 000240.sst being ordered "before" 000241.sst for + // levelIter's purposes. While each file is still consistent before its + // bounds, it's safer to have all rangedel iterators be visible to + // mergingIter. 
+ iter := level.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + rangeDelIter, closer, err := c.newRangeDelIter( + newIters, iter.Take(), iterOpts, l, &c.bytesIterated) + if err != nil { + // The error will already be annotated with the BackingFileNum, so + // we annotate it with the FileNum. + return errors.Wrapf(err, "pebble: could not open table %s", errors.Safe(f.FileNum)) + } + if rangeDelIter == nil { + continue + } + rangeDelIters = append(rangeDelIters, rangeDelIter) + c.closers = append(c.closers, closer) + } + + // Check if this level has any range keys. + hasRangeKeys := false + for f := iter.First(); f != nil; f = iter.Next() { + if f.HasRangeKeys { + hasRangeKeys = true + break + } + } + if hasRangeKeys { + li := &keyspan.LevelIter{} + newRangeKeyIterWrapper := func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { + iter, err := newRangeKeyIter(file, iterOptions) + if err != nil { + return nil, err + } else if iter == nil { + return emptyKeyspanIter, nil + } + // Ensure that the range key iter is not closed until the compaction is + // finished. This is necessary because range key processing + // requires the range keys to be held in memory for up to the + // lifetime of the compaction. + c.closers = append(c.closers, iter) + iter = noCloseIter{iter} + + // We do not need to truncate range keys to sstable boundaries, or + // only read within the file's atomic compaction units, unlike with + // range tombstones. This is because range keys were added after we + // stopped splitting user keys across sstables, so all the range keys + // in this sstable must wholly lie within the file's bounds. 
+ return iter, err + } + li.Init(keyspan.SpanIterOptions{}, c.cmp, newRangeKeyIterWrapper, level.files.Iter(), l, manifest.KeyTypeRange) + rangeKeyIters = append(rangeKeyIters, li) + } + return nil + } + + for i := range c.inputs { + // If the level is annotated with l0SublevelInfo, expand it into one + // level per sublevel. + // TODO(jackson): Perform this expansion even earlier when we pick the + // compaction? + if len(c.inputs[i].l0SublevelInfo) > 0 { + for _, info := range c.startLevel.l0SublevelInfo { + sublevelCompactionLevel := &compactionLevel{0, info.LevelSlice, nil} + if err := addItersForLevel(sublevelCompactionLevel, info.sublevel); err != nil { + return nil, err + } + } + continue + } + if err := addItersForLevel(&c.inputs[i], manifest.Level(c.inputs[i].level)); err != nil { + return nil, err + } + } + } + + // In normal operation, levelIter iterates over the point operations in a + // level, and initializes a rangeDelIter pointer for the range deletions in + // each table. During compaction, we want to iterate over the merged view of + // point operations and range deletions. In order to do this we create one + // levelIter per level to iterate over the point operations, and collect up + // all the range deletion files. + // + // The range deletion levels are first combined with a keyspan.MergingIter + // (currently wrapped by a keyspan.InternalIteratorShim to satisfy the + // internal iterator interface). The resulting merged rangedel iterator is + // then included with the point levels in a single mergingIter. + // + // Combine all the rangedel iterators using a keyspan.MergingIterator and a + // InternalIteratorShim so that the range deletions may be interleaved in + // the compaction input. + // TODO(jackson): Replace the InternalIteratorShim with an interleaving + // iterator. + if len(rangeDelIters) > 0 { + c.rangeDelIter.Init(c.cmp, rangeDelIters...) 
+ iters = append(iters, &c.rangeDelIter)
+ }
+
+ // If there's only one constituent point iterator, we can avoid the overhead
+ // of a *mergingIter. This is possible, for example, when performing a flush
+ // of a single memtable. Otherwise, combine all the iterators into a merging
+ // iter.
+ iter := iters[0]
+ if len(iters) > 1 {
+ iter = newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...)
+ }
+ // If there are range key iterators, we need to combine them using
+ // keyspan.MergingIter, and then interleave them among the points.
+ if len(rangeKeyIters) > 0 {
+ mi := &keyspan.MergingIter{}
+ mi.Init(c.cmp, rangeKeyCompactionTransform(c.equal, snapshots, c.elideRangeKey), new(keyspan.MergingBuffers), rangeKeyIters...)
+ di := &keyspan.DefragmentingIter{}
+ di.Init(c.comparer, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, new(keyspan.DefragmentingBuffers))
+ c.rangeKeyInterleaving.Init(c.comparer, iter, di, keyspan.InterleavingIterOpts{})
+ iter = &c.rangeKeyInterleaving
+ }
+ return iter, nil
+}
+
+func (c *compaction) newRangeDelIter(
+ newIters tableNewIters,
+ f manifest.LevelFile,
+ opts IterOptions,
+ l manifest.Level,
+ bytesIterated *uint64,
+) (keyspan.FragmentIterator, io.Closer, error) {
+ opts.level = l
+ iter, rangeDelIter, err := newIters(context.Background(), f.FileMetadata,
+ &opts, internalIterOpts{
+ bytesIterated: &c.bytesIterated,
+ bufferPool: &c.bufferPool,
+ })
+ if err != nil {
+ return nil, nil, err
+ }
+ // TODO(peter): It is mildly wasteful to open the point iterator only to
+ // immediately close it. One way to solve this would be to add new
+ // methods to tableCache for creating point and range-deletion iterators
+ // independently. We'd only want to use those methods here,
+ // though. Doesn't seem worth the hassle in the near term.
+ if err = iter.Close(); err != nil { + if rangeDelIter != nil { + err = errors.CombineErrors(err, rangeDelIter.Close()) + } + return nil, nil, err + } + if rangeDelIter == nil { + // The file doesn't contain any range deletions. + return nil, nil, nil + } + + // Ensure that rangeDelIter is not closed until the compaction is + // finished. This is necessary because range tombstone processing + // requires the range tombstones to be held in memory for up to the + // lifetime of the compaction. + closer := rangeDelIter + rangeDelIter = noCloseIter{rangeDelIter} + + // Truncate the range tombstones returned by the iterator to the + // upper bound of the atomic compaction unit of the file. We want to + // truncate the range tombstone to the bounds of the file, but files + // with split user keys pose an obstacle: The file's largest bound + // is inclusive whereas the range tombstone's end is exclusive. + // + // Consider the example: + // + // 000001:[b-f#200] range del [c,k) + // 000002:[f#190-g#inf] range del [c,k) + // 000003:[g#500-i#3] + // + // Files 000001 and 000002 contain the untruncated range tombstones + // [c,k). While the keyspace covered by 000003 was at one point + // deleted by the tombstone [c,k), the tombstone may have already + // been compacted away and the file does not contain an untruncated + // range tombstone. We want to bound 000001's tombstone to the file + // bounds, but it's not possible to encode a range tombstone with an + // end boundary within a user key (eg, between sequence numbers + // f#200 and f#190). Instead, we expand 000001 to its atomic + // compaction unit (000001 and 000002) and truncate the tombstone to + // g#inf. + // + // NB: We must not use the atomic compaction unit of the entire + // compaction, because the [c,k) tombstone contained in the file + // 000001 ≥ g. If 000001, 000002 and 000003 are all included in the + // same compaction, the compaction's atomic compaction unit includes + // 000003. 
However 000003's keys must not be covered by 000001's
+ // untruncated range tombstone.
+ //
+ // Note that we need to do this truncation at read time in order to
+ // handle sstables generated by RocksDB and earlier versions of
+ // Pebble which do not truncate range tombstones to atomic
+ // compaction unit boundaries at write time.
+ //
+ // The current Pebble compaction logic DOES truncate tombstones to
+ // atomic unit boundaries at compaction time too.
+ atomicUnit, _ := expandToAtomicUnit(c.cmp, f.Slice(), true /* disableIsCompacting */)
+ lowerBound, upperBound := manifest.KeyRange(c.cmp, atomicUnit.Iter())
+ // Range deletion tombstones are often written to sstables
+ // untruncated on the end key side. However, they are still only
+ // valid within a given file's bounds. The logic for writing range
+ // tombstones to an output file sometimes has an incomplete view
+ // of range tombstones outside the file's internal key bounds. Skip
+ // any range tombstones completely outside file bounds.
+ rangeDelIter = keyspan.Truncate(
+ c.cmp, rangeDelIter, lowerBound.UserKey, upperBound.UserKey,
+ &f.Smallest, &f.Largest, false, /* panicOnUpperTruncate */
+ )
+ return rangeDelIter, closer, nil
+}
+
+func (c *compaction) String() string {
+ if len(c.flushing) != 0 {
+ return "flush\n"
+ }
+
+ var buf bytes.Buffer
+ for level := c.startLevel.level; level <= c.outputLevel.level; level++ {
+ i := level - c.startLevel.level
+ fmt.Fprintf(&buf, "%d:", level)
+ iter := c.inputs[i].files.Iter()
+ for f := iter.First(); f != nil; f = iter.Next() {
+ fmt.Fprintf(&buf, " %s:%s-%s", f.FileNum, f.Smallest, f.Largest)
+ }
+ fmt.Fprintf(&buf, "\n")
+ }
+ return buf.String()
+}
+
+type manualCompaction struct {
+ // Count of the retries either due to too many concurrent compactions, or a
+ // concurrent compaction to overlapping levels.
+ retries int + level int + outputLevel int + done chan error + start []byte + end []byte + split bool +} + +type readCompaction struct { + level int + // [start, end] key ranges are used for de-duping. + start []byte + end []byte + + // The file associated with the compaction. + // If the file no longer belongs in the same + // level, then we skip the compaction. + fileNum base.FileNum +} + +func (d *DB) addInProgressCompaction(c *compaction) { + d.mu.compact.inProgress[c] = struct{}{} + var isBase, isIntraL0 bool + for _, cl := range c.inputs { + iter := cl.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.IsCompacting() { + d.opts.Logger.Fatalf("L%d->L%d: %s already being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum) + } + f.SetCompactionState(manifest.CompactionStateCompacting) + if c.startLevel != nil && c.outputLevel != nil && c.startLevel.level == 0 { + if c.outputLevel.level == 0 { + f.IsIntraL0Compacting = true + isIntraL0 = true + } else { + isBase = true + } + } + } + } + + if (isIntraL0 || isBase) && c.version.L0Sublevels != nil { + l0Inputs := []manifest.LevelSlice{c.startLevel.files} + if isIntraL0 { + l0Inputs = append(l0Inputs, c.outputLevel.files) + } + if err := c.version.L0Sublevels.UpdateStateForStartedCompaction(l0Inputs, isBase); err != nil { + d.opts.Logger.Fatalf("could not update state for compaction: %s", err) + } + } +} + +// Removes compaction markers from files in a compaction. The rollback parameter +// indicates whether the compaction state should be rolled back to its original +// state in the case of an unsuccessful compaction. +// +// DB.mu must be held when calling this method, however this method can drop and +// re-acquire that mutex. All writes to the manifest for this compaction should +// have completed by this point. 
+func (d *DB) clearCompactingState(c *compaction, rollback bool) { + c.versionEditApplied = true + for _, cl := range c.inputs { + iter := cl.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if !f.IsCompacting() { + d.opts.Logger.Fatalf("L%d->L%d: %s not being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum) + } + if !rollback { + // On success all compactions other than move-compactions transition the + // file into the Compacted state. Move-compacted files become eligible + // for compaction again and transition back to NotCompacting. + if c.kind != compactionKindMove { + f.SetCompactionState(manifest.CompactionStateCompacted) + } else { + f.SetCompactionState(manifest.CompactionStateNotCompacting) + } + } else { + // Else, on rollback, all input files unconditionally transition back to + // NotCompacting. + f.SetCompactionState(manifest.CompactionStateNotCompacting) + } + f.IsIntraL0Compacting = false + } + } + l0InProgress := inProgressL0Compactions(d.getInProgressCompactionInfoLocked(c)) + func() { + // InitCompactingFileInfo requires that no other manifest writes be + // happening in parallel with it, i.e. we're not in the midst of installing + // another version. Otherwise, it's possible that we've created another + // L0Sublevels instance, but not added it to the versions list, causing + // all the indices in FileMetadata to be inaccurate. To ensure this, + // grab the manifest lock. 
+ d.mu.versions.logLock() + defer d.mu.versions.logUnlock() + d.mu.versions.currentVersion().L0Sublevels.InitCompactingFileInfo(l0InProgress) + }() +} + +func (d *DB) calculateDiskAvailableBytes() uint64 { + if space, err := d.opts.FS.GetDiskUsage(d.dirname); err == nil { + d.diskAvailBytes.Store(space.AvailBytes) + return space.AvailBytes + } else if !errors.Is(err, vfs.ErrUnsupported) { + d.opts.EventListener.BackgroundError(err) + } + return d.diskAvailBytes.Load() +} + +func (d *DB) getDeletionPacerInfo() deletionPacerInfo { + var pacerInfo deletionPacerInfo + // Call GetDiskUsage after every file deletion. This may seem inefficient, + // but in practice this was observed to take constant time, regardless of + // volume size used, at least on linux with ext4 and zfs. All invocations + // take 10 microseconds or less. + pacerInfo.freeBytes = d.calculateDiskAvailableBytes() + d.mu.Lock() + pacerInfo.obsoleteBytes = d.mu.versions.metrics.Table.ObsoleteSize + pacerInfo.liveBytes = uint64(d.mu.versions.metrics.Total().Size) + d.mu.Unlock() + return pacerInfo +} + +// onObsoleteTableDelete is called to update metrics when an sstable is deleted. +func (d *DB) onObsoleteTableDelete(fileSize uint64) { + d.mu.Lock() + d.mu.versions.metrics.Table.ObsoleteCount-- + d.mu.versions.metrics.Table.ObsoleteSize -= fileSize + d.mu.Unlock() +} + +// maybeScheduleFlush schedules a flush if necessary. +// +// d.mu must be held when calling this. +func (d *DB) maybeScheduleFlush() { + if d.mu.compact.flushing || d.closed.Load() != nil || d.opts.ReadOnly { + return + } + if len(d.mu.mem.queue) <= 1 { + return + } + + if !d.passedFlushThreshold() { + return + } + + d.mu.compact.flushing = true + go d.flush() +} + +func (d *DB) passedFlushThreshold() bool { + var n int + var size uint64 + for ; n < len(d.mu.mem.queue)-1; n++ { + if !d.mu.mem.queue[n].readyForFlush() { + break + } + if d.mu.mem.queue[n].flushForced { + // A flush was forced. 
Pretend the memtable size is the configured + // size. See minFlushSize below. + size += d.opts.MemTableSize + } else { + size += d.mu.mem.queue[n].totalBytes() + } + } + if n == 0 { + // None of the immutable memtables are ready for flushing. + return false + } + + // Only flush once the sum of the queued memtable sizes exceeds half the + // configured memtable size. This prevents flushing of memtables at startup + // while we're undergoing the ramp period on the memtable size. See + // DB.newMemTable(). + minFlushSize := d.opts.MemTableSize / 2 + return size >= minFlushSize +} + +func (d *DB) maybeScheduleDelayedFlush(tbl *memTable, dur time.Duration) { + var mem *flushableEntry + for _, m := range d.mu.mem.queue { + if m.flushable == tbl { + mem = m + break + } + } + if mem == nil || mem.flushForced { + return + } + deadline := d.timeNow().Add(dur) + if !mem.delayedFlushForcedAt.IsZero() && deadline.After(mem.delayedFlushForcedAt) { + // Already scheduled to flush sooner than within `dur`. + return + } + mem.delayedFlushForcedAt = deadline + go func() { + timer := time.NewTimer(dur) + defer timer.Stop() + + select { + case <-d.closedCh: + return + case <-mem.flushed: + return + case <-timer.C: + d.commit.mu.Lock() + defer d.commit.mu.Unlock() + d.mu.Lock() + defer d.mu.Unlock() + + // NB: The timer may fire concurrently with a call to Close. If a + // Close call beat us to acquiring d.mu, d.closed holds ErrClosed, + // and it's too late to flush anything. Otherwise, the Close call + // will block on locking d.mu until we've finished scheduling the + // flush and set `d.mu.compact.flushing` to true. Close will wait + // for the current flush to complete. 
+ if d.closed.Load() != nil { + return + } + + if d.mu.mem.mutable == tbl { + d.makeRoomForWrite(nil) + } else { + mem.flushForced = true + } + d.maybeScheduleFlush() + } + }() +} + +func (d *DB) flush() { + pprof.Do(context.Background(), flushLabels, func(context.Context) { + flushingWorkStart := time.Now() + d.mu.Lock() + defer d.mu.Unlock() + idleDuration := flushingWorkStart.Sub(d.mu.compact.noOngoingFlushStartTime) + var bytesFlushed uint64 + var err error + if bytesFlushed, err = d.flush1(); err != nil { + // TODO(peter): count consecutive flush errors and backoff. + d.opts.EventListener.BackgroundError(err) + } + d.mu.compact.flushing = false + d.mu.compact.noOngoingFlushStartTime = time.Now() + workDuration := d.mu.compact.noOngoingFlushStartTime.Sub(flushingWorkStart) + d.mu.compact.flushWriteThroughput.Bytes += int64(bytesFlushed) + d.mu.compact.flushWriteThroughput.WorkDuration += workDuration + d.mu.compact.flushWriteThroughput.IdleDuration += idleDuration + // More flush work may have arrived while we were flushing, so schedule + // another flush if needed. + d.maybeScheduleFlush() + // The flush may have produced too many files in a level, so schedule a + // compaction if needed. + d.maybeScheduleCompaction() + d.mu.compact.cond.Broadcast() + }) +} + +// runIngestFlush is used to generate a flush version edit for sstables which +// were ingested as flushables. Both DB.mu and the manifest lock must be held +// while runIngestFlush is called. +func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) { + if len(c.flushing) != 1 { + panic("pebble: ingestedFlushable must be flushed one at a time.") + } + + // Construct the VersionEdit, levelMetrics etc. + c.metrics = make(map[int]*LevelMetrics, numLevels) + // Finding the target level for ingestion must use the latest version + // after the logLock has been acquired. 
+ c.version = d.mu.versions.currentVersion() + + baseLevel := d.mu.versions.picker.getBaseLevel() + iterOpts := IterOptions{logger: d.opts.Logger} + ve := &versionEdit{} + var level int + var err error + var fileToSplit *fileMetadata + var ingestSplitFiles []ingestSplitFile + for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files { + suggestSplit := d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit() && + d.FormatMajorVersion() >= FormatVirtualSSTables + level, fileToSplit, err = ingestTargetLevel( + d.newIters, d.tableNewRangeKeyIter, iterOpts, d.opts.Comparer, + c.version, baseLevel, d.mu.compact.inProgress, file.FileMetadata, + suggestSplit, + ) + if err != nil { + return nil, err + } + ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: file.FileMetadata}) + if fileToSplit != nil { + ingestSplitFiles = append(ingestSplitFiles, ingestSplitFile{ + ingestFile: file.FileMetadata, + splitFile: fileToSplit, + level: level, + }) + } + levelMetrics := c.metrics[level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + c.metrics[level] = levelMetrics + } + levelMetrics.BytesIngested += file.Size + levelMetrics.TablesIngested++ + } + + updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) { + levelMetrics := c.metrics[level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + c.metrics[level] = levelMetrics + } + levelMetrics.NumFiles-- + levelMetrics.Size -= int64(m.Size) + for i := range added { + levelMetrics.NumFiles++ + levelMetrics.Size += int64(added[i].Meta.Size) + } + } + + if len(ingestSplitFiles) > 0 { + ve.DeletedFiles = make(map[manifest.DeletedFileEntry]*manifest.FileMetadata) + replacedFiles := make(map[base.FileNum][]newFileEntry) + if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, ingestSplitFiles, replacedFiles); err != nil { + return nil, err + } + } + + return ve, nil +} + +// flush runs a compaction that copies the immutable memtables 
from memory to +// disk. +// +// d.mu must be held when calling this, but the mutex may be dropped and +// re-acquired during the course of this method. +func (d *DB) flush1() (bytesFlushed uint64, err error) { + // NB: The flushable queue can contain flushables of type ingestedFlushable. + // The sstables in ingestedFlushable.files must be placed into the appropriate + // level in the lsm. Let's say the flushable queue contains a prefix of + // regular immutable memtables, then an ingestedFlushable, and then the + // mutable memtable. When the flush of the ingestedFlushable is performed, + // it needs an updated view of the lsm. That is, the prefix of immutable + // memtables must have already been flushed. Similarly, if there are two + // contiguous ingestedFlushables in the queue, then the first flushable must + // be flushed, so that the second flushable can see an updated view of the + // lsm. + // + // Given the above, we restrict flushes to either some prefix of regular + // memtables, or a single flushable of type ingestedFlushable. The DB.flush + // function will call DB.maybeScheduleFlush again, so a new flush to finish + // the remaining flush work should be scheduled right away. + // + // NB: Large batches placed in the flushable queue share the WAL with the + // previous memtable in the queue. We must ensure the property that both the + // large batch and the memtable with which it shares a WAL are flushed + // together. The property ensures that the minimum unflushed log number + // isn't incremented incorrectly. Since a flushableBatch.readyToFlush always + // returns true, and since the large batch will always be placed right after + // the memtable with which it shares a WAL, the property is naturally + // ensured. The large batch will always be placed after the memtable with + // which it shares a WAL because we ensure it in DB.commitWrite by holding + // the commitPipeline.mu and then holding DB.mu. 
As an extra defensive + // measure, if we try to flush the memtable without also flushing the + // flushable batch in the same flush, since the memtable and flushableBatch + // have the same logNum, the logNum invariant check below will trigger. + var n, inputs int + var inputBytes uint64 + var ingest bool + for ; n < len(d.mu.mem.queue)-1; n++ { + if f, ok := d.mu.mem.queue[n].flushable.(*ingestedFlushable); ok { + if n == 0 { + // The first flushable is of type ingestedFlushable. Since these + // must be flushed individually, we perform a flush for just + // this. + if !f.readyForFlush() { + // This check is almost unnecessary, but we guard against it + // just in case this invariant changes in the future. + panic("pebble: ingestedFlushable should always be ready to flush.") + } + // By setting n = 1, we ensure that the first flushable(n == 0) + // is scheduled for a flush. The number of tables added is equal to the + // number of files in the ingest operation. + n = 1 + inputs = len(f.files) + ingest = true + break + } else { + // There was some prefix of flushables which weren't of type + // ingestedFlushable. So, perform a flush for those. + break + } + } + if !d.mu.mem.queue[n].readyForFlush() { + break + } + inputBytes += d.mu.mem.queue[n].inuseBytes() + } + if n == 0 { + // None of the immutable memtables are ready for flushing. + return 0, nil + } + if !ingest { + // Flushes of memtables add the prefix of n memtables from the flushable + // queue. + inputs = n + } + + // Require that every memtable being flushed has a log number less than the + // new minimum unflushed log number. 
+ minUnflushedLogNum := d.mu.mem.queue[n].logNum + if !d.opts.DisableWAL { + for i := 0; i < n; i++ { + if logNum := d.mu.mem.queue[i].logNum; logNum >= minUnflushedLogNum { + panic(errors.AssertionFailedf("logNum invariant violated: flushing %d items; %d:type=%T,logNum=%d; %d:type=%T,logNum=%d", + n, + i, d.mu.mem.queue[i].flushable, logNum, + n, d.mu.mem.queue[n].flushable, minUnflushedLogNum)) + } + } + } + + c := newFlush(d.opts, d.mu.versions.currentVersion(), + d.mu.versions.picker.getBaseLevel(), d.mu.mem.queue[:n], d.timeNow()) + d.addInProgressCompaction(c) + + jobID := d.mu.nextJobID + d.mu.nextJobID++ + d.opts.EventListener.FlushBegin(FlushInfo{ + JobID: jobID, + Input: inputs, + InputBytes: inputBytes, + Ingest: ingest, + }) + startTime := d.timeNow() + + var ve *manifest.VersionEdit + var pendingOutputs []physicalMeta + var stats compactStats + // To determine the target level of the files in the ingestedFlushable, we + // need to acquire the logLock, and not release it for that duration. Since, + // we need to acquire the logLock below to perform the logAndApply step + // anyway, we create the VersionEdit for ingestedFlushable outside of + // runCompaction. For all other flush cases, we construct the VersionEdit + // inside runCompaction. + if c.kind != compactionKindIngestedFlushable { + ve, pendingOutputs, stats, err = d.runCompaction(jobID, c) + } + + // Acquire logLock. This will be released either on an error, by way of + // logUnlock, or through a call to logAndApply if there is no error. 
+ d.mu.versions.logLock() + + if c.kind == compactionKindIngestedFlushable { + ve, err = d.runIngestFlush(c) + } + + info := FlushInfo{ + JobID: jobID, + Input: inputs, + InputBytes: inputBytes, + Duration: d.timeNow().Sub(startTime), + Done: true, + Ingest: ingest, + Err: err, + } + if err == nil { + for i := range ve.NewFiles { + e := &ve.NewFiles[i] + info.Output = append(info.Output, e.Meta.TableInfo()) + // Ingested tables are not necessarily flushed to L0. Record the level of + // each ingested file explicitly. + if ingest { + info.IngestLevels = append(info.IngestLevels, e.Level) + } + } + if len(ve.NewFiles) == 0 { + info.Err = errEmptyTable + } + + // The flush succeeded or it produced an empty sstable. In either case we + // want to bump the minimum unflushed log number to the log number of the + // oldest unflushed memtable. + ve.MinUnflushedLogNum = minUnflushedLogNum + if c.kind != compactionKindIngestedFlushable { + metrics := c.metrics[0] + if d.opts.DisableWAL { + // If the WAL is disabled, every flushable has a zero [logSize], + // resulting in zero bytes in. Instead, use the number of bytes we + // flushed as the BytesIn. This ensures we get a reasonable w-amp + // calculation even when the WAL is disabled. + metrics.BytesIn = metrics.BytesFlushed + } else { + metrics := c.metrics[0] + for i := 0; i < n; i++ { + metrics.BytesIn += d.mu.mem.queue[i].logSize + } + } + } else if len(ve.DeletedFiles) > 0 { + // c.kind == compactionKindIngestedFlushable && we have deleted files due + // to ingest-time splits. + // + // Iterate through all other compactions, and check if their inputs have + // been replaced due to an ingest-time split. In that case, cancel the + // compaction. 
+ for c2 := range d.mu.compact.inProgress { + for i := range c2.inputs { + iter := c2.inputs[i].files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if _, ok := ve.DeletedFiles[deletedFileEntry{FileNum: f.FileNum, Level: c2.inputs[i].level}]; ok { + c2.cancel.Store(true) + break + } + } + } + } + } + err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false, /* forceRotation */ + func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) }) + if err != nil { + info.Err = err + // TODO(peter): untested. + for _, f := range pendingOutputs { + // Note that the FileBacking for the file metadata might not have + // been set yet. So, we directly use the FileNum. Since these + // files were generated as compaction outputs, these must be + // physical files on disk. This property might not hold once + // https://github.com/cockroachdb/pebble/issues/389 is + // implemented if #389 creates virtual sstables as output files. + d.mu.versions.obsoleteTables = append( + d.mu.versions.obsoleteTables, + fileInfo{f.FileNum.DiskFileNum(), f.Size}, + ) + } + d.mu.versions.updateObsoleteTableMetricsLocked() + } + } else { + // We won't be performing the logAndApply step because of the error, + // so logUnlock. + d.mu.versions.logUnlock() + } + + bytesFlushed = c.bytesIterated + + // If err != nil, then the flush will be retried, and we will recalculate + // these metrics. 
+ if err == nil { + d.mu.snapshots.cumulativePinnedCount += stats.cumulativePinnedKeys + d.mu.snapshots.cumulativePinnedSize += stats.cumulativePinnedSize + d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.countMissizedDels + d.maybeUpdateDeleteCompactionHints(c) + } + + d.clearCompactingState(c, err != nil) + delete(d.mu.compact.inProgress, c) + d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics) + + var flushed flushableList + if err == nil { + flushed = d.mu.mem.queue[:n] + d.mu.mem.queue = d.mu.mem.queue[n:] + d.updateReadStateLocked(d.opts.DebugCheck) + d.updateTableStatsLocked(ve.NewFiles) + if ingest { + d.mu.versions.metrics.Flush.AsIngestCount++ + for _, l := range c.metrics { + d.mu.versions.metrics.Flush.AsIngestBytes += l.BytesIngested + d.mu.versions.metrics.Flush.AsIngestTableCount += l.TablesIngested + } + } + + // Update if any eventually file-only snapshots have now transitioned to + // being file-only. + earliestUnflushedSeqNum := d.getEarliestUnflushedSeqNumLocked() + currentVersion := d.mu.versions.currentVersion() + for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; { + if s.efos == nil { + s = s.next + continue + } + if base.Visible(earliestUnflushedSeqNum, s.efos.seqNum, InternalKeySeqNumMax) { + s = s.next + continue + } + if s.efos.excised.Load() { + // If a concurrent excise has happened that overlaps with one of the key + // ranges this snapshot is interested in, this EFOS cannot transition to + // a file-only snapshot as keys in that range could now be deleted. Move + // onto the next snapshot. + s = s.next + continue + } + currentVersion.Ref() + + // NB: s.efos.transitionToFileOnlySnapshot could close s, in which + // case s.next would be nil. Save it before calling it. + next := s.next + _ = s.efos.transitionToFileOnlySnapshot(currentVersion) + s = next + } + } + // Signal FlushEnd after installing the new readState. 
This helps for unit + // tests that use the callback to trigger a read using an iterator with + // IterOptions.OnlyReadGuaranteedDurable. + info.TotalDuration = d.timeNow().Sub(startTime) + d.opts.EventListener.FlushEnd(info) + + // The order of these operations matters here for ease of testing. + // Removing the reader reference first allows tests to be guaranteed that + // the memtable reservation has been released by the time a synchronous + // flush returns. readerUnrefLocked may also produce obsolete files so the + // call to deleteObsoleteFiles must happen after it. + for i := range flushed { + flushed[i].readerUnrefLocked(true) + } + + d.deleteObsoleteFiles(jobID) + + // Mark all the memtables we flushed as flushed. + for i := range flushed { + close(flushed[i].flushed) + } + + return bytesFlushed, err +} + +// maybeScheduleCompactionAsync should be used when +// we want to possibly schedule a compaction, but don't +// want to eat the cost of running maybeScheduleCompaction. +// This method should be launched in a separate goroutine. +// d.mu must not be held when this is called. +func (d *DB) maybeScheduleCompactionAsync() { + defer d.compactionSchedulers.Done() + + d.mu.Lock() + d.maybeScheduleCompaction() + d.mu.Unlock() +} + +// maybeScheduleCompaction schedules a compaction if necessary. +// +// d.mu must be held when calling this. +func (d *DB) maybeScheduleCompaction() { + d.maybeScheduleCompactionPicker(pickAuto) +} + +func pickAuto(picker compactionPicker, env compactionEnv) *pickedCompaction { + return picker.pickAuto(env) +} + +func pickElisionOnly(picker compactionPicker, env compactionEnv) *pickedCompaction { + return picker.pickElisionOnlyCompaction(env) +} + +// maybeScheduleCompactionPicker schedules a compaction if necessary, +// calling `pickFunc` to pick automatic compactions. +// +// d.mu must be held when calling this. 
+func (d *DB) maybeScheduleCompactionPicker(
+ pickFunc func(compactionPicker, compactionEnv) *pickedCompaction,
+) {
+ if d.closed.Load() != nil || d.opts.ReadOnly {
+ return
+ }
+ maxConcurrentCompactions := d.opts.MaxConcurrentCompactions()
+ if d.mu.compact.compactingCount >= maxConcurrentCompactions {
+ if len(d.mu.compact.manual) > 0 {
+ // Inability to run head blocks later manual compactions.
+ d.mu.compact.manual[0].retries++
+ }
+ return
+ }
+
+ // Compaction picking needs a coherent view of a Version. In particular, we
+ // need to exclude concurrent ingestions from making a decision on which level
+ // to ingest into that conflicts with our compaction
+ // decision. versionSet.logLock provides the necessary mutual exclusion.
+ d.mu.versions.logLock()
+ defer d.mu.versions.logUnlock()
+
+ // Check for the closed flag again, in case the DB was closed while we were
+ // waiting for logLock().
+ if d.closed.Load() != nil {
+ return
+ }
+
+ env := compactionEnv{
+ diskAvailBytes: d.diskAvailBytes.Load(),
+ earliestSnapshotSeqNum: d.mu.snapshots.earliest(),
+ earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(),
+ }
+
+ // Check for delete-only compactions first, because they're expected to be
+ // cheap and reduce future compaction work.
+ if !d.opts.private.disableDeleteOnlyCompactions && + len(d.mu.compact.deletionHints) > 0 && + !d.opts.DisableAutomaticCompactions { + v := d.mu.versions.currentVersion() + snapshots := d.mu.snapshots.toSlice() + inputs, unresolvedHints := checkDeleteCompactionHints(d.cmp, v, d.mu.compact.deletionHints, snapshots) + d.mu.compact.deletionHints = unresolvedHints + + if len(inputs) > 0 { + c := newDeleteOnlyCompaction(d.opts, v, inputs, d.timeNow()) + d.mu.compact.compactingCount++ + d.addInProgressCompaction(c) + go d.compact(c, nil) + } + } + + for len(d.mu.compact.manual) > 0 && d.mu.compact.compactingCount < maxConcurrentCompactions { + v := d.mu.versions.currentVersion() + manual := d.mu.compact.manual[0] + env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) + pc, retryLater := pickManualCompaction(v, d.opts, env, d.mu.versions.picker.getBaseLevel(), manual) + if pc != nil { + c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider()) + d.mu.compact.manual = d.mu.compact.manual[1:] + d.mu.compact.compactingCount++ + d.addInProgressCompaction(c) + go d.compact(c, manual.done) + } else if !retryLater { + // Noop + d.mu.compact.manual = d.mu.compact.manual[1:] + manual.done <- nil + } else { + // Inability to run head blocks later manual compactions. 
+ manual.retries++ + break + } + } + + for !d.opts.DisableAutomaticCompactions && d.mu.compact.compactingCount < maxConcurrentCompactions { + env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) + env.readCompactionEnv = readCompactionEnv{ + readCompactions: &d.mu.compact.readCompactions, + flushing: d.mu.compact.flushing || d.passedFlushThreshold(), + rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction, + } + pc := pickFunc(d.mu.versions.picker, env) + if pc == nil { + break + } + c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider()) + d.mu.compact.compactingCount++ + d.addInProgressCompaction(c) + go d.compact(c, nil) + } +} + +// deleteCompactionHintType indicates whether the deleteCompactionHint was +// generated from a span containing a range del (point key only), a range key +// delete (range key only), or both a point and range key. +type deleteCompactionHintType uint8 + +const ( + // NOTE: While these are primarily used as enumeration types, they are also + // used for some bitwise operations. Care should be taken when updating. + deleteCompactionHintTypeUnknown deleteCompactionHintType = iota + deleteCompactionHintTypePointKeyOnly + deleteCompactionHintTypeRangeKeyOnly + deleteCompactionHintTypePointAndRangeKey +) + +// String implements fmt.Stringer. +func (h deleteCompactionHintType) String() string { + switch h { + case deleteCompactionHintTypeUnknown: + return "unknown" + case deleteCompactionHintTypePointKeyOnly: + return "point-key-only" + case deleteCompactionHintTypeRangeKeyOnly: + return "range-key-only" + case deleteCompactionHintTypePointAndRangeKey: + return "point-and-range-key" + default: + panic(fmt.Sprintf("unknown hint type: %d", h)) + } +} + +// compactionHintFromKeys returns a deleteCompactionHintType given a slice of +// keyspan.Keys. 
+func compactionHintFromKeys(keys []keyspan.Key) deleteCompactionHintType {
+	var hintType deleteCompactionHintType
+	for _, k := range keys {
+		switch k.Kind() {
+		case base.InternalKeyKindRangeDelete:
+			hintType |= deleteCompactionHintTypePointKeyOnly
+		case base.InternalKeyKindRangeKeyDelete:
+			hintType |= deleteCompactionHintTypeRangeKeyOnly
+		default:
+			panic(fmt.Sprintf("unsupported key kind: %s", k.Kind()))
+		}
+	}
+	return hintType
+}
+
+// A deleteCompactionHint records a user key and sequence number span that has been
+// deleted by a range tombstone. A hint is recorded if at least one sstable
+// falls completely within both the user key and sequence number spans.
+// Once the tombstones and the observed completely-contained sstables fall
+// into the same snapshot stripe, a delete-only compaction may delete any
+// sstables within the range.
+type deleteCompactionHint struct {
+	// The type of key span that generated this hint (point key, range key, or
+	// both).
+	hintType deleteCompactionHintType
+	// start and end are user keys specifying a key range [start, end) of
+	// deleted keys.
+	start []byte
+	end   []byte
+	// The level of the file containing the range tombstone(s) when the hint
+	// was created. Only lower levels need to be searched for files that may
+	// be deleted.
+	tombstoneLevel int
+	// The file containing the range tombstone(s) that created the hint.
+	tombstoneFile *fileMetadata
+	// The smallest and largest sequence numbers of the abutting tombstones
+	// merged to form this hint. All of a table's keys must be less than the
+	// tombstone smallest sequence number to be deleted. All of a table's
+	// sequence numbers must fall into the same snapshot stripe as the
+	// tombstone largest sequence number to be deleted.
+	tombstoneLargestSeqNum  uint64
+	tombstoneSmallestSeqNum uint64
+	// The smallest sequence number of a sstable that was found to be covered
+	// by this hint. The hint cannot be resolved until this sequence number is
+	// in the same snapshot stripe as the largest tombstone sequence number.
+	// This is set when a hint is created, so the LSM may look different and
+	// notably no longer contain the sstable that contained the key at this
+	// sequence number.
+	fileSmallestSeqNum uint64
+}
+
+// String implements fmt.Stringer.
+func (h deleteCompactionHint) String() string {
+	return fmt.Sprintf(
+		"L%d.%s %s-%s seqnums(tombstone=%d-%d, file-smallest=%d, type=%s)",
+		h.tombstoneLevel, h.tombstoneFile.FileNum, h.start, h.end,
+		h.tombstoneSmallestSeqNum, h.tombstoneLargestSeqNum, h.fileSmallestSeqNum,
+		h.hintType,
+	)
+}
+
+// canDelete reports whether the sstable m may be deleted under this hint:
+// m's keys must all be older than the hint's oldest tombstone, m's oldest
+// key must be in the same snapshot stripe as the hint's newest tombstone,
+// m's key kinds must be compatible with the hint's type, and m's key range
+// must be completely contained within [h.start, h.end).
+func (h *deleteCompactionHint) canDelete(cmp Compare, m *fileMetadata, snapshots []uint64) bool {
+	// The file can only be deleted if all of its keys are older than the
+	// earliest tombstone aggregated into the hint.
+	if m.LargestSeqNum >= h.tombstoneSmallestSeqNum || m.SmallestSeqNum < h.fileSmallestSeqNum {
+		return false
+	}
+
+	// The file's oldest key must be in the same snapshot stripe as the
+	// newest tombstone. NB: We already checked the hint's sequence numbers,
+	// but this file's oldest sequence number might be lower than the hint's
+	// smallest sequence number despite the file falling within the key range
+	// if this file was constructed after the hint by a compaction.
+	ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots)
+	fi, _ := snapshotIndex(m.SmallestSeqNum, snapshots)
+	if ti != fi {
+		return false
+	}
+
+	switch h.hintType {
+	case deleteCompactionHintTypePointKeyOnly:
+		// A hint generated by a range del span cannot delete tables that contain
+		// range keys.
+		if m.HasRangeKeys {
+			return false
+		}
+	case deleteCompactionHintTypeRangeKeyOnly:
+		// A hint generated by a range key del span cannot delete tables that
+		// contain point keys.
+		if m.HasPointKeys {
+			return false
+		}
+	case deleteCompactionHintTypePointAndRangeKey:
+		// A hint from a span that contains both range dels *and* range keys can
+		// only be deleted if both bounds fall within the hint. The next check takes
+		// care of this.
+	default:
+		panic(fmt.Sprintf("pebble: unknown delete compaction hint type: %d", h.hintType))
+	}
+
+	// The file's keys must be completely contained within the hint range.
+	return cmp(h.start, m.Smallest.UserKey) <= 0 && cmp(m.Largest.UserKey, h.end) < 0
+}
+
+// maybeUpdateDeleteCompactionHints removes any deletion hints that may have
+// been invalidated by the completed compaction c (which can zero sequence
+// numbers). Called with d.mu held (it mutates d.mu.compact.deletionHints).
+func (d *DB) maybeUpdateDeleteCompactionHints(c *compaction) {
+	// Compactions that zero sequence numbers can interfere with compaction
+	// deletion hints. Deletion hints apply to tables containing keys older
+	// than a threshold. If a key more recent than the threshold is zeroed in
+	// a compaction, a delete-only compaction may mistake it as meeting the
+	// threshold and drop a table containing live data.
+	//
+	// To avoid this scenario, compactions that zero sequence numbers remove
+	// any conflicting deletion hints. A deletion hint is conflicting if both
+	// of the following conditions apply:
+	// * its key space overlaps with the compaction
+	// * at least one of its inputs contains a key as recent as one of the
+	//   hint's tombstones.
+	//
+	if !c.allowedZeroSeqNum {
+		return
+	}
+
+	// Filter in place, reusing the slice's backing array.
+	updatedHints := d.mu.compact.deletionHints[:0]
+	for _, h := range d.mu.compact.deletionHints {
+		// If the compaction's key space is disjoint from the hint's key
+		// space, the zeroing of sequence numbers won't affect the hint. Keep
+		// the hint.
+		keysDisjoint := d.cmp(h.end, c.smallest.UserKey) < 0 || d.cmp(h.start, c.largest.UserKey) > 0
+		if keysDisjoint {
+			updatedHints = append(updatedHints, h)
+			continue
+		}
+
+		// All of the compaction's inputs must be older than the hint's
+		// tombstones.
+		inputsOlder := true
+		for _, in := range c.inputs {
+			iter := in.files.Iter()
+			for f := iter.First(); f != nil; f = iter.Next() {
+				inputsOlder = inputsOlder && f.LargestSeqNum < h.tombstoneSmallestSeqNum
+			}
+		}
+		if inputsOlder {
+			updatedHints = append(updatedHints, h)
+			continue
+		}
+
+		// Drop h, because the compaction c may have zeroed sequence numbers
+		// of keys more recent than some of h's tombstones.
+	}
+	d.mu.compact.deletionHints = updatedHints
+}
+
+// checkDeleteCompactionHints examines hints against the version v and the
+// open snapshots, returning (1) the per-level sets of not-yet-compacting
+// files that can be deleted outright, and (2) the hints that cannot be
+// resolved yet. Resolvable hints are consumed whether or not they yield any
+// deletable files.
+func checkDeleteCompactionHints(
+	cmp Compare, v *version, hints []deleteCompactionHint, snapshots []uint64,
+) ([]compactionLevel, []deleteCompactionHint) {
+	var files map[*fileMetadata]bool
+	var byLevel [numLevels][]*fileMetadata
+
+	unresolvedHints := hints[:0]
+	for _, h := range hints {
+		// Check each compaction hint to see if it's resolvable. Resolvable
+		// hints are removed and trigger a delete-only compaction if any files
+		// in the current LSM still meet their criteria. Unresolvable hints
+		// are saved and don't trigger a delete-only compaction.
+		//
+		// When a compaction hint is created, the sequence numbers of the
+		// range tombstones and the covered file with the oldest key are
+		// recorded. The largest tombstone sequence number and the smallest
+		// file sequence number must be in the same snapshot stripe for the
+		// hint to be resolved. The below graphic models a compaction hint
+		// covering the keyspace [b, r). The hint completely contains two
+		// files, 000002 and 000003. The file 000003 contains the lowest
+		// covered sequence number at #90. The tombstone b.RANGEDEL.230:h has
+		// the highest tombstone sequence number incorporated into the hint.
+		// The hint may be resolved only once the snapshots at #100, #180 and
+		// #210 are all closed. File 000001 is not included within the hint
+		// because it extends beyond the range tombstones in user key space.
+		//
+		// 250
+		//
+		//       |-b...230:h-|
+		// _____________________________________________________ snapshot #210
+		// 200               |--h.RANGEDEL.200:r--|
+		//
+		// _____________________________________________________ snapshot #180
+		//
+		// 150                     +--------+
+		//           +---------+   | 000003 |
+		//           | 000002  |   |        |
+		//           +_________+   |        |
+		// 100_____________________|________|___________________ snapshot #100
+		//                         +--------+
+		// _____________________________________________________ snapshot #70
+		//                             +---------------+
+		//  50                         | 000001        |
+		//                             |               |
+		//                             +---------------+
+		// ______________________________________________________________
+		//     a b c d e f g h i j k l m n o p q r s t u v w x y z
+
+		ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots)
+		fi, _ := snapshotIndex(h.fileSmallestSeqNum, snapshots)
+		if ti != fi {
+			// Cannot resolve yet.
+			unresolvedHints = append(unresolvedHints, h)
+			continue
+		}
+
+		// The hint h will be resolved and dropped, regardless of whether
+		// there are any tables that can be deleted.
+		for l := h.tombstoneLevel + 1; l < numLevels; l++ {
+			overlaps := v.Overlaps(l, cmp, h.start, h.end, true /* exclusiveEnd */)
+			iter := overlaps.Iter()
+			for m := iter.First(); m != nil; m = iter.Next() {
+				if m.IsCompacting() || !h.canDelete(cmp, m, snapshots) || files[m] {
+					continue
+				}
+				if files == nil {
+					// Construct files lazily, assuming most calls will not
+					// produce delete-only compactions.
+					files = make(map[*fileMetadata]bool)
+				}
+				files[m] = true
+				byLevel[l] = append(byLevel[l], m)
+			}
+		}
+	}
+
+	var compactLevels []compactionLevel
+	for l, files := range byLevel {
+		if len(files) == 0 {
+			continue
+		}
+		compactLevels = append(compactLevels, compactionLevel{
+			level: l,
+			files: manifest.NewLevelSliceKeySorted(cmp, files),
+		})
+	}
+	return compactLevels, unresolvedHints
+}
+
+// compact runs one compaction and maybe schedules another call to compact.
+func (d *DB) compact(c *compaction, errChannel chan error) {
+	pprof.Do(context.Background(), compactLabels, func(context.Context) {
+		// d.mu is held for the duration; compact1 may drop and re-acquire it.
+		d.mu.Lock()
+		defer d.mu.Unlock()
+		if err := d.compact1(c, errChannel); err != nil {
+			// TODO(peter): count consecutive compaction errors and backoff.
+			d.opts.EventListener.BackgroundError(err)
+		}
+		d.mu.compact.compactingCount--
+		delete(d.mu.compact.inProgress, c)
+		// Add this compaction's duration to the cumulative duration. NB: This
+		// must be atomic with the above removal of c from
+		// d.mu.compact.InProgress to ensure Metrics.Compact.Duration does not
+		// miss or double count a completing compaction's duration.
+		d.mu.compact.duration += d.timeNow().Sub(c.beganAt)
+
+		// The previous compaction may have produced too many files in a
+		// level, so reschedule another compaction if needed.
+		d.maybeScheduleCompaction()
+		d.mu.compact.cond.Broadcast()
+	})
+}
+
+// compact1 runs one compaction.
+//
+// d.mu must be held when calling this, but the mutex may be dropped and
+// re-acquired during the course of this method.
+func (d *DB) compact1(c *compaction, errChannel chan error) (err error) {
+	// If the caller supplied a channel, report the final error on it (even on
+	// panic-free early returns) via the named result parameter.
+	if errChannel != nil {
+		defer func() {
+			errChannel <- err
+		}()
+	}
+
+	jobID := d.mu.nextJobID
+	d.mu.nextJobID++
+	info := c.makeInfo(jobID)
+	d.opts.EventListener.CompactionBegin(info)
+	startTime := d.timeNow()
+
+	ve, pendingOutputs, stats, err := d.runCompaction(jobID, c)
+
+	info.Duration = d.timeNow().Sub(startTime)
+	if err == nil {
+		err = func() error {
+			var err error
+			d.mu.versions.logLock()
+			// Check if this compaction had a conflicting operation (eg. a d.excise())
+			// that necessitates it restarting from scratch. Note that since we hold
+			// the manifest lock, we don't expect this bool to change its value
+			// as only the holder of the manifest lock will ever write to it.
+			if c.cancel.Load() {
+				err = firstError(err, ErrCancelledCompaction)
+			}
+			if err != nil {
+				// logAndApply calls logUnlock. If we didn't call it, we need to call
+				// logUnlock ourselves.
+				d.mu.versions.logUnlock()
+				return err
+			}
+			return d.mu.versions.logAndApply(jobID, ve, c.metrics, false /* forceRotation */, func() []compactionInfo {
+				return d.getInProgressCompactionInfoLocked(c)
+			})
+		}()
+		if err != nil {
+			// The version edit was not applied: the outputs we produced are
+			// garbage and must be queued for deletion.
+			// TODO(peter): untested.
+			for _, f := range pendingOutputs {
+				// Note that the FileBacking for the file metadata might not have
+				// been set yet. So, we directly use the FileNum. Since these
+				// files were generated as compaction outputs, these must be
+				// physical files on disk. This property might not hold once
+				// https://github.com/cockroachdb/pebble/issues/389 is
+				// implemented if #389 creates virtual sstables as output files.
+				d.mu.versions.obsoleteTables = append(
+					d.mu.versions.obsoleteTables,
+					fileInfo{f.FileNum.DiskFileNum(), f.Size},
+				)
+			}
+			d.mu.versions.updateObsoleteTableMetricsLocked()
+		}
+	}
+
+	info.Done = true
+	info.Err = err
+	if err == nil {
+		for i := range ve.NewFiles {
+			e := &ve.NewFiles[i]
+			info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo())
+		}
+		d.mu.snapshots.cumulativePinnedCount += stats.cumulativePinnedKeys
+		d.mu.snapshots.cumulativePinnedSize += stats.cumulativePinnedSize
+		d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.countMissizedDels
+		d.maybeUpdateDeleteCompactionHints(c)
+	}
+
+	// NB: clearing compacting state must occur before updating the read state;
+	// L0Sublevels initialization depends on it.
+	d.clearCompactingState(c, err != nil)
+	d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics)
+	d.mu.versions.incrementCompactionBytes(-c.bytesWritten)
+
+	info.TotalDuration = d.timeNow().Sub(c.beganAt)
+	d.opts.EventListener.CompactionEnd(info)
+
+	// Update the read state before deleting obsolete files because the
+	// read-state update will cause the previous version to be unref'd and if
+	// there are no references obsolete tables will be added to the obsolete
+	// table list.
+	if err == nil {
+		d.updateReadStateLocked(d.opts.DebugCheck)
+		d.updateTableStatsLocked(ve.NewFiles)
+	}
+	d.deleteObsoleteFiles(jobID)
+
+	return err
+}
+
+// compactStats accumulates statistics gathered over the course of a single
+// compaction run; see the snapshot/tombstone accounting in compact1.
+type compactStats struct {
+	cumulativePinnedKeys uint64
+	cumulativePinnedSize uint64
+	countMissizedDels    uint64
+}
+
+// runCopyCompaction runs a copy compaction where a new FileNum is created that
+// is a byte-for-byte copy of the input file. This is used in lieu of a move
+// compaction when a file is being moved across the local/remote storage
+// boundary.
+//
+// d.mu must be held when calling this method.
+func (d *DB) runCopyCompaction(
+	jobID int,
+	c *compaction,
+	meta *fileMetadata,
+	objMeta objstorage.ObjectMetadata,
+	versionEdit *versionEdit,
+) (ve *versionEdit, pendingOutputs []physicalMeta, retErr error) {
+	ve = versionEdit
+	if objMeta.IsRemote() || !remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level) {
+		panic("pebble: scheduled a copy compaction that is not actually moving files to shared storage")
+	}
+	// Note that based on logic in the compaction picker, we're guaranteed
+	// meta.Virtual is false.
+	if meta.Virtual {
+		panic(errors.AssertionFailedf("cannot do a copy compaction of a virtual sstable across local/remote storage"))
+	}
+	// We are in the relatively more complex case where we need to copy this
+	// file to remote/shared storage. Drop the db mutex while we do the
+	// copy.
+	//
+	// To ease up cleanup of the local file and tracking of refs, we create
+	// a new FileNum. This has the potential of making the block cache less
+	// effective, however.
+	metaCopy := new(fileMetadata)
+	*metaCopy = fileMetadata{
+		Size:           meta.Size,
+		CreationTime:   meta.CreationTime,
+		SmallestSeqNum: meta.SmallestSeqNum,
+		LargestSeqNum:  meta.LargestSeqNum,
+		Stats:          meta.Stats,
+		Virtual:        meta.Virtual,
+	}
+	if meta.HasPointKeys {
+		metaCopy.ExtendPointKeyBounds(c.cmp, meta.SmallestPointKey, meta.LargestPointKey)
+	}
+	if meta.HasRangeKeys {
+		metaCopy.ExtendRangeKeyBounds(c.cmp, meta.SmallestRangeKey, meta.LargestRangeKey)
+	}
+	metaCopy.FileNum = d.mu.versions.getNextFileNum()
+	metaCopy.InitPhysicalBacking()
+	c.metrics = map[int]*LevelMetrics{
+		c.outputLevel.level: {
+			BytesIn:         meta.Size,
+			BytesCompacted:  meta.Size,
+			TablesCompacted: 1,
+		},
+	}
+	pendingOutputs = append(pendingOutputs, metaCopy.PhysicalMeta())
+	// Before dropping the db mutex, grab a ref to the current version. This
+	// prevents any concurrent excises from deleting files that this compaction
+	// needs to read/maintain a reference to.
+	vers := d.mu.versions.currentVersion()
+	vers.Ref()
+	defer vers.UnrefLocked()
+
+	// NB: unusual order — d.mu is held on entry, dropped for the copy I/O,
+	// and re-acquired before returning.
+	d.mu.Unlock()
+	defer d.mu.Lock()
+	_, err := d.objProvider.LinkOrCopyFromLocal(context.TODO(), d.opts.FS,
+		d.objProvider.Path(objMeta), fileTypeTable, metaCopy.FileBacking.DiskFileNum,
+		objstorage.CreateOptions{PreferSharedStorage: true})
+	if err != nil {
+		return ve, pendingOutputs, err
+	}
+	ve.NewFiles[0].Meta = metaCopy
+
+	if err := d.objProvider.Sync(); err != nil {
+		return nil, pendingOutputs, err
+	}
+	return ve, pendingOutputs, nil
+}
+
+// runCompaction runs a compaction that produces new on-disk tables from
+// memtables or old on-disk tables.
+//
+// d.mu must be held when calling this, but the mutex may be dropped and
+// re-acquired during the course of this method.
+func (d *DB) runCompaction( + jobID int, c *compaction, +) (ve *versionEdit, pendingOutputs []physicalMeta, stats compactStats, retErr error) { + // As a sanity check, confirm that the smallest / largest keys for new and + // deleted files in the new versionEdit pass a validation function before + // returning the edit. + defer func() { + // If we're handling a panic, don't expect the version edit to validate. + if r := recover(); r != nil { + panic(r) + } else if ve != nil { + err := validateVersionEdit(ve, d.opts.Experimental.KeyValidationFunc, d.opts.Comparer.FormatKey) + if err != nil { + d.opts.Logger.Fatalf("pebble: version edit validation failed: %s", err) + } + } + }() + + // Check for a delete-only compaction. This can occur when wide range + // tombstones completely contain sstables. + if c.kind == compactionKindDeleteOnly { + c.metrics = make(map[int]*LevelMetrics, len(c.inputs)) + ve := &versionEdit{ + DeletedFiles: map[deletedFileEntry]*fileMetadata{}, + } + for _, cl := range c.inputs { + levelMetrics := &LevelMetrics{} + iter := cl.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + ve.DeletedFiles[deletedFileEntry{ + Level: cl.level, + FileNum: f.FileNum, + }] = f + } + c.metrics[cl.level] = levelMetrics + } + return ve, nil, stats, nil + } + + if c.kind == compactionKindIngestedFlushable { + panic("pebble: runCompaction cannot handle compactionKindIngestedFlushable.") + } + + // Check for a move or copy of one table from one level to the next. We avoid + // such a move if there is lots of overlapping grandparent data. Otherwise, + // the move could create a parent file that will require a very expensive + // merge later on. 
+ if c.kind == compactionKindMove || c.kind == compactionKindCopy { + iter := c.startLevel.files.Iter() + meta := iter.First() + if invariants.Enabled { + if iter.Next() != nil { + panic("got more than one file for a move or copy compaction") + } + } + if c.cancel.Load() { + return ve, nil, stats, ErrCancelledCompaction + } + objMeta, err := d.objProvider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) + if err != nil { + return ve, pendingOutputs, stats, err + } + c.metrics = map[int]*LevelMetrics{ + c.outputLevel.level: { + BytesMoved: meta.Size, + TablesMoved: 1, + }, + } + ve := &versionEdit{ + DeletedFiles: map[deletedFileEntry]*fileMetadata{ + {Level: c.startLevel.level, FileNum: meta.FileNum}: meta, + }, + NewFiles: []newFileEntry{ + {Level: c.outputLevel.level, Meta: meta}, + }, + } + if c.kind == compactionKindCopy { + ve, pendingOutputs, retErr = d.runCopyCompaction(jobID, c, meta, objMeta, ve) + if retErr != nil { + return ve, pendingOutputs, stats, retErr + } + } + return ve, nil, stats, nil + } + + defer func() { + if retErr != nil { + pendingOutputs = nil + } + }() + + snapshots := d.mu.snapshots.toSlice() + formatVers := d.FormatMajorVersion() + + if c.flushing == nil { + // Before dropping the db mutex, grab a ref to the current version. This + // prevents any concurrent excises from deleting files that this compaction + // needs to read/maintain a reference to. + // + // Note that unlike user iterators, compactionIter does not maintain a ref + // of the version or read state. + vers := d.mu.versions.currentVersion() + vers.Ref() + defer vers.UnrefLocked() + } + + if c.cancel.Load() { + return ve, nil, stats, ErrCancelledCompaction + } + + // Release the d.mu lock while doing I/O. + // Note the unusual order: Unlock and then Lock. + d.mu.Unlock() + defer d.mu.Lock() + + // Compactions use a pool of buffers to read blocks, avoiding polluting the + // block cache with blocks that will not be read again. 
We initialize the + // buffer pool with a size 12. This initial size does not need to be + // accurate, because the pool will grow to accommodate the maximum number of + // blocks allocated at a given time over the course of the compaction. But + // choosing a size larger than that working set avoids any additional + // allocations to grow the size of the pool over the course of iteration. + // + // Justification for initial size 12: In a two-level compaction, at any + // given moment we'll have 2 index blocks in-use and 2 data blocks in-use. + // Additionally, when decoding a compressed block, we'll temporarily + // allocate 1 additional block to hold the compressed buffer. In the worst + // case that all input sstables have two-level index blocks (+2), value + // blocks (+2), range deletion blocks (+n) and range key blocks (+n), we'll + // additionally require 2n+4 blocks where n is the number of input sstables. + // Range deletion and range key blocks are relatively rare, and the cost of + // an additional allocation or two over the course of the compaction is + // considered to be okay. A larger initial size would cause the pool to hold + // on to more memory, even when it's not in-use because the pool will + // recycle buffers up to the current capacity of the pool. The memory use of + // a 12-buffer pool is expected to be within reason, even if all the buffers + // grow to the typical size of an index block (256 KiB) which would + // translate to 3 MiB per compaction. 
+ c.bufferPool.Init(12) + defer c.bufferPool.Release() + + iiter, err := c.newInputIter(d.newIters, d.tableNewRangeKeyIter, snapshots) + if err != nil { + return nil, pendingOutputs, stats, err + } + c.allowedZeroSeqNum = c.allowZeroSeqNum() + iiter = invalidating.MaybeWrapIfInvariants(iiter) + iter := newCompactionIter(c.cmp, c.equal, c.formatKey, d.merge, iiter, snapshots, + &c.rangeDelFrag, &c.rangeKeyFrag, c.allowedZeroSeqNum, c.elideTombstone, + c.elideRangeTombstone, d.FormatMajorVersion()) + + var ( + createdFiles []base.DiskFileNum + tw *sstable.Writer + pinnedKeySize uint64 + pinnedValueSize uint64 + pinnedCount uint64 + ) + defer func() { + if iter != nil { + retErr = firstError(retErr, iter.Close()) + } + if tw != nil { + retErr = firstError(retErr, tw.Close()) + } + if retErr != nil { + for _, fileNum := range createdFiles { + _ = d.objProvider.Remove(fileTypeTable, fileNum) + } + } + for _, closer := range c.closers { + retErr = firstError(retErr, closer.Close()) + } + }() + + ve = &versionEdit{ + DeletedFiles: map[deletedFileEntry]*fileMetadata{}, + } + + startLevelBytes := c.startLevel.files.SizeSum() + outputMetrics := &LevelMetrics{ + BytesIn: startLevelBytes, + BytesRead: c.outputLevel.files.SizeSum(), + } + if len(c.extraLevels) > 0 { + outputMetrics.BytesIn += c.extraLevels[0].files.SizeSum() + } + outputMetrics.BytesRead += outputMetrics.BytesIn + + c.metrics = map[int]*LevelMetrics{ + c.outputLevel.level: outputMetrics, + } + if len(c.flushing) == 0 && c.metrics[c.startLevel.level] == nil { + c.metrics[c.startLevel.level] = &LevelMetrics{} + } + if len(c.extraLevels) > 0 { + c.metrics[c.extraLevels[0].level] = &LevelMetrics{} + outputMetrics.MultiLevel.BytesInTop = startLevelBytes + outputMetrics.MultiLevel.BytesIn = outputMetrics.BytesIn + outputMetrics.MultiLevel.BytesRead = outputMetrics.BytesRead + } + + // The table is typically written at the maximum allowable format implied by + // the current format major version of the DB. 
+ tableFormat := formatVers.MaxTableFormat() + + // In format major versions with maximum table formats of Pebblev3, value + // blocks were conditional on an experimental setting. In format major + // versions with maximum table formats of Pebblev4 and higher, value blocks + // are always enabled. + if tableFormat == sstable.TableFormatPebblev3 && + (d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks()) { + tableFormat = sstable.TableFormatPebblev2 + } + + writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat) + if formatVers < FormatBlockPropertyCollector { + // Cannot yet write block properties. + writerOpts.BlockPropertyCollectors = nil + } + + // prevPointKey is a sstable.WriterOption that provides access to + // the last point key written to a writer's sstable. When a new + // output begins in newOutput, prevPointKey is updated to point to + // the new output's sstable.Writer. This allows the compaction loop + // to access the last written point key without requiring the + // compaction loop to make a copy of each key ahead of time. Users + // must be careful, because the byte slice returned by UnsafeKey + // points directly into the Writer's block buffer. + var prevPointKey sstable.PreviousPointKeyOpt + var cpuWorkHandle CPUWorkHandle + defer func() { + if cpuWorkHandle != nil { + d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle) + } + }() + + newOutput := func() error { + // Check if we've been cancelled by a concurrent operation. 
+ if c.cancel.Load() { + return ErrCancelledCompaction + } + fileMeta := &fileMetadata{} + d.mu.Lock() + fileNum := d.mu.versions.getNextFileNum() + fileMeta.FileNum = fileNum + pendingOutputs = append(pendingOutputs, fileMeta.PhysicalMeta()) + d.mu.Unlock() + + ctx := context.TODO() + if objiotracing.Enabled { + ctx = objiotracing.WithLevel(ctx, c.outputLevel.level) + switch c.kind { + case compactionKindFlush: + ctx = objiotracing.WithReason(ctx, objiotracing.ForFlush) + case compactionKindIngestedFlushable: + ctx = objiotracing.WithReason(ctx, objiotracing.ForIngestion) + default: + ctx = objiotracing.WithReason(ctx, objiotracing.ForCompaction) + } + } + // Prefer shared storage if present. + createOpts := objstorage.CreateOptions{ + PreferSharedStorage: remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level), + } + writable, objMeta, err := d.objProvider.Create(ctx, fileTypeTable, fileNum.DiskFileNum(), createOpts) + if err != nil { + return err + } + + reason := "flushing" + if c.flushing == nil { + reason = "compacting" + } + d.opts.EventListener.TableCreated(TableCreateInfo{ + JobID: jobID, + Reason: reason, + Path: d.objProvider.Path(objMeta), + FileNum: fileNum, + }) + if c.kind != compactionKindFlush { + writable = &compactionWritable{ + Writable: writable, + versions: d.mu.versions, + written: &c.bytesWritten, + } + } + createdFiles = append(createdFiles, fileNum.DiskFileNum()) + cacheOpts := private.SSTableCacheOpts(d.cacheID, fileNum.DiskFileNum()).(sstable.WriterOption) + + const MaxFileWriteAdditionalCPUTime = time.Millisecond * 100 + cpuWorkHandle = d.opts.Experimental.CPUWorkPermissionGranter.GetPermission( + MaxFileWriteAdditionalCPUTime, + ) + writerOpts.Parallelism = + d.opts.Experimental.MaxWriterConcurrency > 0 && + (cpuWorkHandle.Permitted() || d.opts.Experimental.ForceWriterParallelism) + + tw = sstable.NewWriter(writable, writerOpts, cacheOpts, &prevPointKey) + + fileMeta.CreationTime = time.Now().Unix() + 
ve.NewFiles = append(ve.NewFiles, newFileEntry{ + Level: c.outputLevel.level, + Meta: fileMeta, + }) + return nil + } + + // splitL0Outputs is true during flushes and intra-L0 compactions with flush + // splits enabled. + splitL0Outputs := c.outputLevel.level == 0 && d.opts.FlushSplitBytes > 0 + + // finishOutput is called with the a user key up to which all tombstones + // should be flushed. Typically, this is the first key of the next + // sstable or an empty key if this output is the final sstable. + finishOutput := func(splitKey []byte) error { + // If we haven't output any point records to the sstable (tw == nil) then the + // sstable will only contain range tombstones and/or range keys. The smallest + // key in the sstable will be the start key of the first range tombstone or + // range key added. We need to ensure that this start key is distinct from + // the splitKey passed to finishOutput (if set), otherwise we would generate + // an sstable where the largest key is smaller than the smallest key due to + // how the largest key boundary is set below. NB: It is permissible for the + // range tombstone / range key start key to be the empty string. + // + // TODO: It is unfortunate that we have to do this check here rather than + // when we decide to finish the sstable in the runCompaction loop. A better + // structure currently eludes us. + if tw == nil { + startKey := c.rangeDelFrag.Start() + if len(iter.tombstones) > 0 { + startKey = iter.tombstones[0].Start + } + if startKey == nil { + startKey = c.rangeKeyFrag.Start() + if len(iter.rangeKeys) > 0 { + startKey = iter.rangeKeys[0].Start + } + } + if splitKey != nil && d.cmp(startKey, splitKey) == 0 { + return nil + } + } + + // NB: clone the key because the data can be held on to by the call to + // compactionIter.Tombstones via keyspan.Fragmenter.FlushTo, and by the + // WriterMetadata.LargestRangeDel.UserKey. + splitKey = append([]byte(nil), splitKey...) 
+ for _, v := range iter.Tombstones(splitKey) { + if tw == nil { + if err := newOutput(); err != nil { + return err + } + } + // The tombstone being added could be completely outside the + // eventual bounds of the sstable. Consider this example (bounds + // in square brackets next to table filename): + // + // ./000240.sst [tmgc#391,MERGE-tmgc#391,MERGE] + // tmgc#391,MERGE [786e627a] + // tmgc-udkatvs#331,RANGEDEL + // + // ./000241.sst [tmgc#384,MERGE-tmgc#384,MERGE] + // tmgc#384,MERGE [666c7070] + // tmgc-tvsalezade#383,RANGEDEL + // tmgc-tvsalezade#331,RANGEDEL + // + // ./000242.sst [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL] + // tmgc-tvsalezade#383,RANGEDEL + // tmgc#375,SET [72646c78766965616c72776865676e79] + // tmgc-tvsalezade#356,RANGEDEL + // + // Note that both of the top two SSTables have range tombstones + // that start after the file's end keys. Since the file bound + // computation happens well after all range tombstones have been + // added to the writer, eliding out-of-file range tombstones based + // on sequence number at this stage is difficult, and necessitates + // read-time logic to ignore range tombstones outside file bounds. + if err := rangedel.Encode(&v, tw.Add); err != nil { + return err + } + } + for _, v := range iter.RangeKeys(splitKey) { + // Same logic as for range tombstones, except added using tw.AddRangeKey. + if tw == nil { + if err := newOutput(); err != nil { + return err + } + } + if err := rangekey.Encode(&v, tw.AddRangeKey); err != nil { + return err + } + } + + if tw == nil { + return nil + } + { + // Set internal sstable properties. + p := getInternalWriterProperties(tw) + // Set the external sst version to 0. This is what RocksDB expects for + // db-internal sstables; otherwise, it could apply a global sequence number. + p.ExternalFormatVersion = 0 + // Set the snapshot pinned totals. 
+ p.SnapshotPinnedKeys = pinnedCount + p.SnapshotPinnedKeySize = pinnedKeySize + p.SnapshotPinnedValueSize = pinnedValueSize + stats.cumulativePinnedKeys += pinnedCount + stats.cumulativePinnedSize += pinnedKeySize + pinnedValueSize + pinnedCount = 0 + pinnedKeySize = 0 + pinnedValueSize = 0 + } + if err := tw.Close(); err != nil { + tw = nil + return err + } + d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle) + cpuWorkHandle = nil + writerMeta, err := tw.Metadata() + if err != nil { + tw = nil + return err + } + tw = nil + meta := ve.NewFiles[len(ve.NewFiles)-1].Meta + meta.Size = writerMeta.Size + meta.SmallestSeqNum = writerMeta.SmallestSeqNum + meta.LargestSeqNum = writerMeta.LargestSeqNum + meta.InitPhysicalBacking() + + // If the file didn't contain any range deletions, we can fill its + // table stats now, avoiding unnecessarily loading the table later. + maybeSetStatsFromProperties( + meta.PhysicalMeta(), &writerMeta.Properties, + ) + + if c.flushing == nil { + outputMetrics.TablesCompacted++ + outputMetrics.BytesCompacted += meta.Size + } else { + outputMetrics.TablesFlushed++ + outputMetrics.BytesFlushed += meta.Size + } + outputMetrics.Size += int64(meta.Size) + outputMetrics.NumFiles++ + outputMetrics.Additional.BytesWrittenDataBlocks += writerMeta.Properties.DataSize + outputMetrics.Additional.BytesWrittenValueBlocks += writerMeta.Properties.ValueBlocksSize + + if n := len(ve.NewFiles); n > 1 { + // This is not the first output file. Ensure the sstable boundaries + // are nonoverlapping. 
+ prevMeta := ve.NewFiles[n-2].Meta + if writerMeta.SmallestRangeDel.UserKey != nil { + c := d.cmp(writerMeta.SmallestRangeDel.UserKey, prevMeta.Largest.UserKey) + if c < 0 { + return errors.Errorf( + "pebble: smallest range tombstone start key is less than previous sstable largest key: %s < %s", + writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), + prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey)) + } else if c == 0 && !prevMeta.Largest.IsExclusiveSentinel() { + // The user key portion of the range boundary start key is + // equal to the previous table's largest key user key, and + // the previous table's largest key is not exclusive. This + // violates the invariant that tables are key-space + // partitioned. + return errors.Errorf( + "pebble: invariant violation: previous sstable largest key %s, current sstable smallest rangedel: %s", + prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey), + writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), + ) + } + } + } + + // Verify that all range deletions outputted to the sstable are + // truncated to split key. + if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil && + d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) > 0 { + return errors.Errorf( + "pebble: invariant violation: rangedel largest key %q extends beyond split key %q", + writerMeta.LargestRangeDel.Pretty(d.opts.Comparer.FormatKey), + d.opts.Comparer.FormatKey(splitKey), + ) + } + + if writerMeta.HasPointKeys { + meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestPoint, writerMeta.LargestPoint) + } + if writerMeta.HasRangeDelKeys { + meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestRangeDel, writerMeta.LargestRangeDel) + } + if writerMeta.HasRangeKeys { + meta.ExtendRangeKeyBounds(d.cmp, writerMeta.SmallestRangeKey, writerMeta.LargestRangeKey) + } + + // Verify that the sstable bounds fall within the compaction input + // bounds. 
This is a sanity check that we don't have a logic error
+ elsewhere that causes the sstable bounds to accidentally expand past the
+ compaction input bounds as doing so could lead to various badness such
+ as keys being deleted by a range tombstone incorrectly.
+ if c.smallest.UserKey != nil {
+ switch v := d.cmp(meta.Smallest.UserKey, c.smallest.UserKey); {
+ case v >= 0:
+ // Nothing to do.
+ case v < 0:
+ return errors.Errorf("pebble: compaction output grew beyond bounds of input: %s < %s",
+ meta.Smallest.Pretty(d.opts.Comparer.FormatKey),
+ c.smallest.Pretty(d.opts.Comparer.FormatKey))
+ }
+ }
+ if c.largest.UserKey != nil {
+ switch v := d.cmp(meta.Largest.UserKey, c.largest.UserKey); {
+ case v <= 0:
+ // Nothing to do.
+ case v > 0:
+ return errors.Errorf("pebble: compaction output grew beyond bounds of input: %s > %s",
+ meta.Largest.Pretty(d.opts.Comparer.FormatKey),
+ c.largest.Pretty(d.opts.Comparer.FormatKey))
+ }
+ }
+ // Verify that we never split different revisions of the same user key
+ // across two different sstables.
+ if err := c.errorOnUserKeyOverlap(ve); err != nil {
+ return err
+ }
+ if err := meta.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
+ return err
+ }
+ return nil
+ }
+
+ // Build a compactionOutputSplitter that contains all logic to determine
+ // whether the compaction loop should stop writing to one output sstable and
+ // switch to a new one. Some splitters can wrap other splitters, and the
+ // splitterGroup can be composed of multiple splitters. In this case, we
+ // start off with splitters for file sizes, grandparent limits, and (for L0
+ // splits) L0 limits, before wrapping them in a splitterGroup.
+ sizeSplitter := newFileSizeSplitter(&iter.frontiers, c.maxOutputFileSize, c.grandparents.Iter())
+ unsafePrevUserKey := func() []byte {
+ // Return the largest point key written to tw or the start of
+ // the current range deletion in the fragmenter, whichever is
+ // greater.
+ prevPoint := prevPointKey.UnsafeKey() + if c.cmp(prevPoint.UserKey, c.rangeDelFrag.Start()) > 0 { + return prevPoint.UserKey + } + return c.rangeDelFrag.Start() + } + outputSplitters := []compactionOutputSplitter{ + // We do not split the same user key across different sstables within + // one flush or compaction. The fileSizeSplitter may request a split in + // the middle of a user key, so the userKeyChangeSplitter ensures we are + // at a user key change boundary when doing a split. + &userKeyChangeSplitter{ + cmp: c.cmp, + splitter: sizeSplitter, + unsafePrevUserKey: unsafePrevUserKey, + }, + newLimitFuncSplitter(&iter.frontiers, c.findGrandparentLimit), + } + if splitL0Outputs { + outputSplitters = append(outputSplitters, newLimitFuncSplitter(&iter.frontiers, c.findL0Limit)) + } + splitter := &splitterGroup{cmp: c.cmp, splitters: outputSplitters} + + // Each outer loop iteration produces one output file. An iteration that + // produces a file containing point keys (and optionally range tombstones) + // guarantees that the input iterator advanced. An iteration that produces + // a file containing only range tombstones guarantees the limit passed to + // `finishOutput()` advanced to a strictly greater user key corresponding + // to a grandparent file largest key, or nil. Taken together, these + // progress guarantees ensure that eventually the input iterator will be + // exhausted and the range tombstone fragments will all be flushed. + for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty() || !c.rangeKeyFrag.Empty(); { + var firstKey []byte + if key != nil { + firstKey = key.UserKey + } else if startKey := c.rangeDelFrag.Start(); startKey != nil { + // Pass the start key of the first pending tombstone to find the + // next limit. All pending tombstones have the same start key. 
We
+ // use this as opposed to the end key of the last written sstable to
+ // effectively handle cases like these:
+ //
+ // a.SET.3
+ // (lf.limit at b)
+ // d.RANGEDEL.4:f
+ //
+ // In this case, the partition after b has only range deletions, so
+ // if we were to find the limit after the last written key at the
+ // split point (key a), we'd get the limit b again, and
+ // finishOutput() would not advance any further because the next
+ // range tombstone to write does not start until after the L0 split
+ // point.
+ firstKey = startKey
+ }
+ splitterSuggestion := splitter.onNewOutput(firstKey)
+
+ // Each inner loop iteration processes one key from the input iterator.
+ for ; key != nil; key, val = iter.Next() {
+ if split := splitter.shouldSplitBefore(key, tw); split == splitNow {
+ break
+ }
+
+ switch key.Kind() {
+ case InternalKeyKindRangeDelete:
+ // Range tombstones are handled specially. They are fragmented,
+ // and they're not written until later during `finishOutput()`.
+ // We add them to the `Fragmenter` now to make them visible to
+ // `compactionIter` so covered keys in the same snapshot stripe
+ // can be elided.
+
+ // The interleaved range deletion might only be one of many with
+ // these bounds. Some fragmenting is performed ahead of time by
+ // keyspan.MergingIter.
+ if s := c.rangeDelIter.Span(); !s.Empty() {
+ // The memory management here is subtle. Range deletions
+ // blocks do NOT use prefix compression, which ensures that
+ // range deletion spans' memory is available as long as we keep
+ // the iterator open. However, the keyspan.MergingIter that
+ // merges spans across levels only guarantees the lifetime
+ // of the [start, end) bounds until the next positioning
+ // method is called.
+ //
+ // Additionally, the Span.Keys slice is owned by the
+ // range deletion iterator stack, and it may be overwritten
+ // when we advance.
+ //
+ // Clone the Keys slice and the start and end keys.
+ // + // TODO(jackson): Avoid the clone by removing c.rangeDelFrag + // and performing explicit truncation of the pending + // rangedel span as necessary. + clone := keyspan.Span{ + Start: iter.cloneKey(s.Start), + End: iter.cloneKey(s.End), + Keys: make([]keyspan.Key, len(s.Keys)), + } + copy(clone.Keys, s.Keys) + c.rangeDelFrag.Add(clone) + } + continue + case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: + // Range keys are handled in the same way as range tombstones, except + // with a dedicated fragmenter. + if s := c.rangeKeyInterleaving.Span(); !s.Empty() { + clone := keyspan.Span{ + Start: iter.cloneKey(s.Start), + End: iter.cloneKey(s.End), + Keys: make([]keyspan.Key, len(s.Keys)), + } + // Since the keys' Suffix and Value fields are not deep cloned, the + // underlying blockIter must be kept open for the lifetime of the + // compaction. + copy(clone.Keys, s.Keys) + c.rangeKeyFrag.Add(clone) + } + continue + } + if tw == nil { + if err := newOutput(); err != nil { + return nil, pendingOutputs, stats, err + } + } + if err := tw.AddWithForceObsolete(*key, val, iter.forceObsoleteDueToRangeDel); err != nil { + return nil, pendingOutputs, stats, err + } + if iter.snapshotPinned { + // The kv pair we just added to the sstable was only surfaced by + // the compaction iterator because an open snapshot prevented + // its elision. Increment the stats. + pinnedCount++ + pinnedKeySize += uint64(len(key.UserKey)) + base.InternalTrailerLen + pinnedValueSize += uint64(len(val)) + } + } + + // A splitter requested a split, and we're ready to finish the output. + // We need to choose the key at which to split any pending range + // tombstones. There are two options: + // 1. splitterSuggestion — The key suggested by the splitter. This key + // is guaranteed to be greater than the last key written to the + // current output. + // 2. key.UserKey — the first key of the next sstable output. 
This user + // key is also guaranteed to be greater than the last user key + // written to the current output (see userKeyChangeSplitter). + // + // Use whichever is smaller. Using the smaller of the two limits + // overlap with grandparents. Consider the case where the + // grandparent limit is calculated to be 'b', key is 'x', and + // there exist many sstables between 'b' and 'x'. If the range + // deletion fragmenter has a pending tombstone [a,x), splitting + // at 'x' would cause the output table to overlap many + // grandparents well beyond the calculated grandparent limit + // 'b'. Splitting at the smaller `splitterSuggestion` avoids + // this unbounded overlap with grandparent tables. + splitKey := splitterSuggestion + if key != nil && (splitKey == nil || c.cmp(splitKey, key.UserKey) > 0) { + splitKey = key.UserKey + } + if err := finishOutput(splitKey); err != nil { + return nil, pendingOutputs, stats, err + } + } + + for _, cl := range c.inputs { + iter := cl.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + ve.DeletedFiles[deletedFileEntry{ + Level: cl.level, + FileNum: f.FileNum, + }] = f + } + } + + // The compaction iterator keeps track of a count of the number of DELSIZED + // keys that encoded an incorrect size. Propagate it up as a part of + // compactStats. + stats.countMissizedDels = iter.stats.countMissizedDels + + if err := d.objProvider.Sync(); err != nil { + return nil, pendingOutputs, stats, err + } + + // Refresh the disk available statistic whenever a compaction/flush + // completes, before re-acquiring the mutex. + _ = d.calculateDiskAvailableBytes() + + return ve, pendingOutputs, stats, nil +} + +// validateVersionEdit validates that start and end keys across new and deleted +// files in a versionEdit pass the given validation function. 
+func validateVersionEdit(
+ ve *versionEdit, validateFn func([]byte) error, format base.FormatKey,
+) error {
+ validateMetaFn := func(f *manifest.FileMetadata) error {
+ for _, key := range []InternalKey{f.Smallest, f.Largest} {
+ if err := validateFn(key.UserKey); err != nil {
+ return errors.Wrapf(err, "key=%q; file=%s", format(key.UserKey), f)
+ }
+ }
+ return nil
+ }
+
+ // Validate both new and deleted files.
+ for _, f := range ve.NewFiles {
+ if err := validateMetaFn(f.Meta); err != nil {
+ return err
+ }
+ }
+ for _, m := range ve.DeletedFiles {
+ if err := validateMetaFn(m); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+// scanObsoleteFiles scans the filesystem for files that are no longer needed
+// and adds those to the internal lists of obsolete files. Note that the files
+// are not actually deleted by this method. A subsequent call to
+// deleteObsoleteFiles must be performed. Must not be called concurrently
+// with compactions and flushes. db.mu must be held when calling this function.
+func (d *DB) scanObsoleteFiles(list []string) {
+ // Disable automatic compactions temporarily to avoid concurrent compactions /
+ // flushes from interfering. The original value is restored on completion.
+ disabledPrev := d.opts.DisableAutomaticCompactions
+ defer func() {
+ d.opts.DisableAutomaticCompactions = disabledPrev
+ }()
+ d.opts.DisableAutomaticCompactions = true
+
+ // Wait for any ongoing compaction to complete before continuing.
+ for d.mu.compact.compactingCount > 0 || d.mu.compact.flushing {
+ d.mu.compact.cond.Wait()
+ }
+
+ liveFileNums := make(map[base.DiskFileNum]struct{})
+ d.mu.versions.addLiveFileNums(liveFileNums)
+ // Protect against files which are only referred to by the ingestedFlushable
+ // from being deleted. These are added to the flushable queue on WAL replay
+ // during read only mode and aren't part of the Version.
Note that if + // !d.opts.ReadOnly, then all flushables of type ingestedFlushable have + // already been flushed. + for _, fEntry := range d.mu.mem.queue { + if f, ok := fEntry.flushable.(*ingestedFlushable); ok { + for _, file := range f.files { + liveFileNums[file.FileBacking.DiskFileNum] = struct{}{} + } + } + } + + minUnflushedLogNum := d.mu.versions.minUnflushedLogNum + manifestFileNum := d.mu.versions.manifestFileNum + + var obsoleteLogs []fileInfo + var obsoleteTables []fileInfo + var obsoleteManifests []fileInfo + var obsoleteOptions []fileInfo + + for _, filename := range list { + fileType, diskFileNum, ok := base.ParseFilename(d.opts.FS, filename) + if !ok { + continue + } + switch fileType { + case fileTypeLog: + if diskFileNum >= minUnflushedLogNum { + continue + } + fi := fileInfo{fileNum: diskFileNum} + if stat, err := d.opts.FS.Stat(filename); err == nil { + fi.fileSize = uint64(stat.Size()) + } + obsoleteLogs = append(obsoleteLogs, fi) + case fileTypeManifest: + if diskFileNum >= manifestFileNum { + continue + } + fi := fileInfo{fileNum: diskFileNum} + if stat, err := d.opts.FS.Stat(filename); err == nil { + fi.fileSize = uint64(stat.Size()) + } + obsoleteManifests = append(obsoleteManifests, fi) + case fileTypeOptions: + if diskFileNum.FileNum() >= d.optionsFileNum.FileNum() { + continue + } + fi := fileInfo{fileNum: diskFileNum} + if stat, err := d.opts.FS.Stat(filename); err == nil { + fi.fileSize = uint64(stat.Size()) + } + obsoleteOptions = append(obsoleteOptions, fi) + case fileTypeTable: + // Objects are handled through the objstorage provider below. + default: + // Don't delete files we don't know about. 
+ } + } + + objects := d.objProvider.List() + for _, obj := range objects { + switch obj.FileType { + case fileTypeTable: + if _, ok := liveFileNums[obj.DiskFileNum]; ok { + continue + } + fileInfo := fileInfo{ + fileNum: obj.DiskFileNum, + } + if size, err := d.objProvider.Size(obj); err == nil { + fileInfo.fileSize = uint64(size) + } + obsoleteTables = append(obsoleteTables, fileInfo) + + default: + // Ignore object types we don't know about. + } + } + + d.mu.log.queue = merge(d.mu.log.queue, obsoleteLogs) + d.mu.versions.metrics.WAL.Files = int64(len(d.mu.log.queue)) + d.mu.versions.obsoleteTables = merge(d.mu.versions.obsoleteTables, obsoleteTables) + d.mu.versions.updateObsoleteTableMetricsLocked() + d.mu.versions.obsoleteManifests = merge(d.mu.versions.obsoleteManifests, obsoleteManifests) + d.mu.versions.obsoleteOptions = merge(d.mu.versions.obsoleteOptions, obsoleteOptions) +} + +// disableFileDeletions disables file deletions and then waits for any +// in-progress deletion to finish. The caller is required to call +// enableFileDeletions in order to enable file deletions again. It is ok for +// multiple callers to disable file deletions simultaneously, though they must +// all invoke enableFileDeletions in order for file deletions to be re-enabled +// (there is an internal reference count on file deletion disablement). +// +// d.mu must be held when calling this method. +func (d *DB) disableFileDeletions() { + d.mu.disableFileDeletions++ + d.mu.Unlock() + defer d.mu.Lock() + d.cleanupManager.Wait() +} + +// enableFileDeletions enables previously disabled file deletions. A cleanup job +// is queued if necessary. +// +// d.mu must be held when calling this method. 
+func (d *DB) enableFileDeletions() {
+ if d.mu.disableFileDeletions <= 0 {
+ panic("pebble: file deletion disablement invariant violated")
+ }
+ d.mu.disableFileDeletions--
+ if d.mu.disableFileDeletions > 0 {
+ return
+ }
+ jobID := d.mu.nextJobID
+ d.mu.nextJobID++
+ d.deleteObsoleteFiles(jobID)
+}
+
+type fileInfo struct {
+ fileNum base.DiskFileNum
+ fileSize uint64
+}
+
+// deleteObsoleteFiles enqueues a cleanup job to the cleanup manager, if necessary.
+//
+// d.mu must be held when calling this. The function will release and re-acquire the mutex.
+//
+// Does nothing if file deletions are disabled (see disableFileDeletions). A
+// cleanup job will be scheduled when file deletions are re-enabled.
+func (d *DB) deleteObsoleteFiles(jobID int) {
+ if d.mu.disableFileDeletions > 0 {
+ return
+ }
+
+ var obsoleteLogs []fileInfo
+ for i := range d.mu.log.queue {
+ // NB: d.mu.versions.minUnflushedLogNum is the log number of the earliest
+ // log that has not had its contents flushed to an sstable. We can recycle
+ // the prefix of d.mu.log.queue with log numbers less than
+ // minUnflushedLogNum.
+ if d.mu.log.queue[i].fileNum >= d.mu.versions.minUnflushedLogNum {
+ obsoleteLogs = d.mu.log.queue[:i]
+ d.mu.log.queue = d.mu.log.queue[i:]
+ d.mu.versions.metrics.WAL.Files -= int64(len(obsoleteLogs))
+ break
+ }
+ }
+
+ obsoleteTables := append([]fileInfo(nil), d.mu.versions.obsoleteTables...)
+ d.mu.versions.obsoleteTables = nil
+
+ for _, tbl := range obsoleteTables {
+ delete(d.mu.versions.zombieTables, tbl.fileNum)
+ }
+
+ // Sort the manifests because we want to delete some contiguous prefix
+ // of the older manifests.
+ slices.SortFunc(d.mu.versions.obsoleteManifests, func(a, b fileInfo) int { + return cmp.Compare(a.fileNum, b.fileNum) + }) + + var obsoleteManifests []fileInfo + manifestsToDelete := len(d.mu.versions.obsoleteManifests) - d.opts.NumPrevManifest + if manifestsToDelete > 0 { + obsoleteManifests = d.mu.versions.obsoleteManifests[:manifestsToDelete] + d.mu.versions.obsoleteManifests = d.mu.versions.obsoleteManifests[manifestsToDelete:] + if len(d.mu.versions.obsoleteManifests) == 0 { + d.mu.versions.obsoleteManifests = nil + } + } + + obsoleteOptions := d.mu.versions.obsoleteOptions + d.mu.versions.obsoleteOptions = nil + + // Release d.mu while preparing the cleanup job and possibly waiting. + // Note the unusual order: Unlock and then Lock. + d.mu.Unlock() + defer d.mu.Lock() + + files := [4]struct { + fileType fileType + obsolete []fileInfo + }{ + {fileTypeLog, obsoleteLogs}, + {fileTypeTable, obsoleteTables}, + {fileTypeManifest, obsoleteManifests}, + {fileTypeOptions, obsoleteOptions}, + } + _, noRecycle := d.opts.Cleaner.(base.NeedsFileContents) + filesToDelete := make([]obsoleteFile, 0, len(obsoleteLogs)+len(obsoleteTables)+len(obsoleteManifests)+len(obsoleteOptions)) + for _, f := range files { + // We sort to make the order of deletions deterministic, which is nice for + // tests. 
+ slices.SortFunc(f.obsolete, func(a, b fileInfo) int { + return cmp.Compare(a.fileNum, b.fileNum) + }) + for _, fi := range f.obsolete { + dir := d.dirname + switch f.fileType { + case fileTypeLog: + if !noRecycle && d.logRecycler.add(fi) { + continue + } + dir = d.walDirname + case fileTypeTable: + d.tableCache.evict(fi.fileNum) + } + + filesToDelete = append(filesToDelete, obsoleteFile{ + dir: dir, + fileNum: fi.fileNum, + fileType: f.fileType, + fileSize: fi.fileSize, + }) + } + } + if len(filesToDelete) > 0 { + d.cleanupManager.EnqueueJob(jobID, filesToDelete) + } + if d.opts.private.testingAlwaysWaitForCleanup { + d.cleanupManager.Wait() + } +} + +func (d *DB) maybeScheduleObsoleteTableDeletion() { + d.mu.Lock() + defer d.mu.Unlock() + d.maybeScheduleObsoleteTableDeletionLocked() +} + +func (d *DB) maybeScheduleObsoleteTableDeletionLocked() { + if len(d.mu.versions.obsoleteTables) > 0 { + jobID := d.mu.nextJobID + d.mu.nextJobID++ + d.deleteObsoleteFiles(jobID) + } +} + +func merge(a, b []fileInfo) []fileInfo { + if len(b) == 0 { + return a + } + + a = append(a, b...) + slices.SortFunc(a, func(a, b fileInfo) int { + return cmp.Compare(a.fileNum, b.fileNum) + }) + return slices.CompactFunc(a, func(a, b fileInfo) bool { + return a.fileNum == b.fileNum + }) +} diff --git a/pebble/compaction_iter.go b/pebble/compaction_iter.go new file mode 100644 index 0000000..0fb9e45 --- /dev/null +++ b/pebble/compaction_iter.go @@ -0,0 +1,1473 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "sort" + "strconv" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/bytealloc" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/rangekey" + "github.com/cockroachdb/redact" +) + +// compactionIter provides a forward-only iterator that encapsulates the logic +// for collapsing entries during compaction. It wraps an internal iterator and +// collapses entries that are no longer necessary because they are shadowed by +// newer entries. The simplest example of this is when the internal iterator +// contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries, +// compactionIter collapses the second entry because it is no longer +// necessary. The high-level structure for compactionIter is to iterate over +// its internal iterator and output 1 entry for every user-key. There are four +// complications to this story. +// +// 1. Eliding Deletion Tombstones +// +// Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to +// a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly +// shadows an entry at a lower level. If we're compacting to the base-level in +// the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower +// level and can be elided. +// +// We can do slightly better than only eliding deletion tombstones at the base +// level by observing that we can elide a deletion tombstone if there are no +// sstables that contain the entry's key. This check is performed by +// elideTombstone. +// +// 2. Merges +// +// The MERGE operation merges the value for an entry with the existing value +// for an entry. The logical value of an entry can be composed of a series of +// merge operations. 
When compactionIter sees a MERGE, it scans forward in its +// internal iterator collapsing MERGE operations for the same key until it +// encounters a SET or DELETE operation. For example, the keys a.MERGE.4, +// a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be +// merged using the specified Merger. +// +// An interesting case here occurs when MERGE is combined with SET. Consider +// the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The +// reason that the kind is changed to SET is because the SET operation acts as +// a barrier preventing further merging. This can be seen better in the +// scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at lower +// (older) level and not involved in the compaction. If the compaction of +// a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with +// a.MERGE.1 would merge the values together incorrectly. +// +// 3. Snapshots +// +// Snapshots are lightweight point-in-time views of the DB state. At its core, +// a snapshot is a sequence number along with a guarantee from Pebble that it +// will maintain the view of the database at that sequence number. Part of this +// guarantee is relatively straightforward to achieve. When reading from the +// database Pebble will ignore sequence numbers that are larger than the +// snapshot sequence number. The primary complexity with snapshots occurs +// during compaction: the collapsing of entries that are shadowed by newer +// entries is at odds with the guarantee that Pebble will maintain the view of +// the database at the snapshot sequence number. Rather than collapsing entries +// up to the next user key, compactionIter can only collapse entries up to the +// next snapshot boundary. That is, every snapshot boundary potentially causes +// another entry for the same user-key to be emitted. 
Another way to view this +// is that snapshots define stripes and entries are collapsed within stripes, +// but not across stripes. Consider the following scenario: +// +// a.PUT.9 +// a.DEL.8 +// a.PUT.7 +// a.DEL.6 +// a.PUT.5 +// +// In the absence of snapshots these entries would be collapsed to +// a.PUT.9. What if there is a snapshot at sequence number 7? The entries can +// be divided into two stripes and collapsed within the stripes: +// +// a.PUT.9 a.PUT.9 +// a.DEL.8 ---> +// a.PUT.7 +// -- -- +// a.DEL.6 ---> a.DEL.6 +// a.PUT.5 +// +// All of the rules described earlier still apply, but they are confined to +// operate within a snapshot stripe. Snapshots only affect compaction when the +// snapshot sequence number lies within the range of sequence numbers being +// compacted. In the above example, a snapshot at sequence number 10 or at +// sequence number 5 would not have any effect. +// +// 4. Range Deletions +// +// Range deletions provide the ability to delete all of the keys (and values) +// in a contiguous range. Range deletions are stored indexed by their start +// key. The end key of the range is stored in the value. In order to support +// lookup of the range deletions which overlap with a particular key, the range +// deletion tombstones need to be fragmented whenever they overlap. This +// fragmentation is performed by keyspan.Fragmenter. The fragments are then +// subject to the rules for snapshots. For example, consider the two range +// tombstones [a,e)#1 and [c,g)#2: +// +// 2: c-------g +// 1: a-------e +// +// These tombstones will be fragmented into: +// +// 2: c---e---g +// 1: a---c---e +// +// Do we output the fragment [c,e)#1? Since it is covered by [c-e]#2 the answer +// depends on whether it is in a new snapshot stripe. +// +// In addition to the fragmentation of range tombstones, compaction also needs +// to take the range tombstones into consideration when outputting normal +// keys. 
Just as with point deletions, a range deletion covering an entry can +// cause the entry to be elided. +// +// A note on the stability of keys and values. +// +// The stability guarantees of keys and values returned by the iterator tree +// that backs a compactionIter is nuanced and care must be taken when +// referencing any returned items. +// +// Keys and values returned by exported functions (i.e. First, Next, etc.) have +// lifetimes that fall into two categories: +// +// Lifetime valid for duration of compaction. Range deletion keys and values are +// stable for the duration of the compaction, due to way in which a +// compactionIter is typically constructed (i.e. via (*compaction).newInputIter, +// which wraps the iterator over the range deletion block in a noCloseIter, +// preventing the release of the backing memory until the compaction is +// finished). +// +// Lifetime limited to duration of sstable block liveness. Point keys (SET, DEL, +// etc.) and values must be cloned / copied following the return from the +// exported function, and before a subsequent call to Next advances the iterator +// and mutates the contents of the returned key and value. +type compactionIter struct { + equal Equal + merge Merge + iter internalIterator + err error + // `key.UserKey` is set to `keyBuf` caused by saving `i.iterKey.UserKey` + // and `key.Trailer` is set to `i.iterKey.Trailer`. This is the + // case on return from all public methods -- these methods return `key`. + // Additionally, it is the internal state when the code is moving to the + // next key so it can determine whether the user key has changed from + // the previous key. + key InternalKey + // keyTrailer is updated when `i.key` is updated and holds the key's + // original trailer (eg, before any sequence-number zeroing or changes to + // key kind). 
+ keyTrailer uint64 + value []byte + valueCloser io.Closer + // Temporary buffer used for storing the previous user key in order to + // determine when iteration has advanced to a new user key and thus a new + // snapshot stripe. + keyBuf []byte + // Temporary buffer used for storing the previous value, which may be an + // unsafe, i.iter-owned slice that could be altered when the iterator is + // advanced. + valueBuf []byte + // Is the current entry valid? + valid bool + iterKey *InternalKey + iterValue []byte + iterStripeChange stripeChangeType + // `skip` indicates whether the remaining skippable entries in the current + // snapshot stripe should be skipped or processed. An example of a non- + // skippable entry is a range tombstone as we need to return it from the + // `compactionIter`, even if a key covering its start key has already been + // seen in the same stripe. `skip` has no effect when `pos == iterPosNext`. + // + // TODO(jackson): If we use keyspan.InterleavingIter for range deletions, + // like we do for range keys, the only remaining 'non-skippable' key is + // the invalid key. We should be able to simplify this logic and remove this + // field. + skip bool + // `pos` indicates the iterator position at the top of `Next()`. Its type's + // (`iterPos`) values take on the following meanings in the context of + // `compactionIter`. + // + // - `iterPosCur`: the iterator is at the last key returned. + // - `iterPosNext`: the iterator has already been advanced to the next + // candidate key. For example, this happens when processing merge operands, + // where we advance the iterator all the way into the next stripe or next + // user key to ensure we've seen all mergeable operands. + // - `iterPosPrev`: this is invalid as compactionIter is forward-only. + pos iterPos + // `snapshotPinned` indicates whether the last point key returned by the + // compaction iterator was only returned because an open snapshot prevents + // its elision. 
This field only applies to point keys, and not to range + // deletions or range keys. + // + // For MERGE, it is possible that doing the merge is interrupted even when + // the next point key is in the same stripe. This can happen if the loop in + // mergeNext gets interrupted by sameStripeNonSkippable. + // sameStripeNonSkippable occurs due to RANGEDELs that sort before + // SET/MERGE/DEL with the same seqnum, so the RANGEDEL does not necessarily + // delete the subsequent SET/MERGE/DEL keys. + snapshotPinned bool + // forceObsoleteDueToRangeDel is set to true in a subset of the cases that + // snapshotPinned is true. This value is true when the point is obsolete due + // to a RANGEDEL but could not be deleted due to a snapshot. + // + // NB: it may seem that the additional cases that snapshotPinned captures + // are harmless in that they can also be used to mark a point as obsolete + // (it is merely a duplication of some logic that happens in + // Writer.AddWithForceObsolete), but that is not quite accurate as of this + // writing -- snapshotPinned originated in stats collection and for a + // sequence MERGE, SET, where the MERGE cannot merge with the (older) SET + // due to a snapshot, the snapshotPinned value for the SET is true. + // + // TODO(sumeer,jackson): improve the logic of snapshotPinned and reconsider + // whether we need forceObsoleteDueToRangeDel. + forceObsoleteDueToRangeDel bool + // The index of the snapshot for the current key within the snapshots slice. + curSnapshotIdx int + curSnapshotSeqNum uint64 + // The snapshot sequence numbers that need to be maintained. These sequence + // numbers define the snapshot stripes (see the Snapshots description + // above). The sequence numbers are in ascending order. + snapshots []uint64 + // frontiers holds a heap of user keys that affect compaction behavior when + // they're exceeded. 
Before a new key is returned, the compaction iterator + // advances the frontier, notifying any code that subscribed to be notified + // when a key was reached. The primary use today is within the + // implementation of compactionOutputSplitters in compaction.go. Many of + // these splitters wait for the compaction iterator to call Advance(k) when + // it's returning a new key. If the key that they're waiting for is + // surpassed, these splitters update internal state recording that they + // should request a compaction split next time they're asked in + // [shouldSplitBefore]. + frontiers frontiers + // Reference to the range deletion tombstone fragmenter (e.g., + // `compaction.rangeDelFrag`). + rangeDelFrag *keyspan.Fragmenter + rangeKeyFrag *keyspan.Fragmenter + // The fragmented tombstones. + tombstones []keyspan.Span + // The fragmented range keys. + rangeKeys []keyspan.Span + // Byte allocator for the tombstone keys. + alloc bytealloc.A + allowZeroSeqNum bool + elideTombstone func(key []byte) bool + elideRangeTombstone func(start, end []byte) bool + // The on-disk format major version. This informs the types of keys that + // may be written to disk during a compaction. + formatVersion FormatMajorVersion + stats struct { + // count of DELSIZED keys that were missized. 
+ countMissizedDels uint64 + } +} + +func newCompactionIter( + cmp Compare, + equal Equal, + formatKey base.FormatKey, + merge Merge, + iter internalIterator, + snapshots []uint64, + rangeDelFrag *keyspan.Fragmenter, + rangeKeyFrag *keyspan.Fragmenter, + allowZeroSeqNum bool, + elideTombstone func(key []byte) bool, + elideRangeTombstone func(start, end []byte) bool, + formatVersion FormatMajorVersion, +) *compactionIter { + i := &compactionIter{ + equal: equal, + merge: merge, + iter: iter, + snapshots: snapshots, + frontiers: frontiers{cmp: cmp}, + rangeDelFrag: rangeDelFrag, + rangeKeyFrag: rangeKeyFrag, + allowZeroSeqNum: allowZeroSeqNum, + elideTombstone: elideTombstone, + elideRangeTombstone: elideRangeTombstone, + formatVersion: formatVersion, + } + i.rangeDelFrag.Cmp = cmp + i.rangeDelFrag.Format = formatKey + i.rangeDelFrag.Emit = i.emitRangeDelChunk + i.rangeKeyFrag.Cmp = cmp + i.rangeKeyFrag.Format = formatKey + i.rangeKeyFrag.Emit = i.emitRangeKeyChunk + return i +} + +func (i *compactionIter) First() (*InternalKey, []byte) { + if i.err != nil { + return nil, nil + } + var iterValue LazyValue + i.iterKey, iterValue = i.iter.First() + i.iterValue, _, i.err = iterValue.Value(nil) + if i.err != nil { + return nil, nil + } + if i.iterKey != nil { + i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(i.iterKey.SeqNum(), i.snapshots) + } + i.pos = iterPosNext + i.iterStripeChange = newStripeNewKey + return i.Next() +} + +func (i *compactionIter) Next() (*InternalKey, []byte) { + if i.err != nil { + return nil, nil + } + + // Close the closer for the current value if one was open. + if i.closeValueCloser() != nil { + return nil, nil + } + + // Prior to this call to `Next()` we are in one of four situations with + // respect to `iterKey` and related state: + // + // - `!skip && pos == iterPosNext`: `iterKey` is already at the next key. + // - `!skip && pos == iterPosCurForward`: We are at the key that has been returned. 
+ // To move forward we advance by one key, even if that lands us in the same + // snapshot stripe. + // - `skip && pos == iterPosCurForward`: We are at the key that has been returned. + // To move forward we skip skippable entries in the stripe. + // - `skip && pos == iterPosNext && i.iterStripeChange == sameStripeNonSkippable`: + // This case may occur when skipping within a snapshot stripe and we + // encounter either: + // a) an invalid key kind; The previous call will have returned + // whatever key it was processing and deferred handling of the + // invalid key to this invocation of Next(). We're responsible for + // ignoring skip=true and falling into the invalid key kind case + // down below. + // b) an interleaved range delete; This is a wart of the current code + // structure. While skipping within a snapshot stripe, a range + // delete interleaved at its start key and sequence number + // interrupts the sequence of point keys. After we return the range + // delete to the caller, we need to pick up skipping at where we + // left off, so we preserve skip=true. + // TODO(jackson): This last case is confusing and can be removed if we + // interleave range deletions at the maximal sequence number using the + // keyspan interleaving iterator. This is the treatment given to range + // keys today. + if i.pos == iterPosCurForward { + if i.skip { + i.skipInStripe() + } else { + i.nextInStripe() + } + } else if i.skip { + if i.iterStripeChange != sameStripeNonSkippable { + panic(errors.AssertionFailedf("compaction iterator has skip=true, but iterator is at iterPosNext")) + } + } + + i.pos = iterPosCurForward + i.valid = false + + for i.iterKey != nil { + // If we entered a new snapshot stripe with the same key, any key we + // return on this iteration is only returned because the open snapshot + // prevented it from being elided or merged with the key returned for + // the previous stripe. 
Mark it as pinned so that the compaction loop + // can correctly populate output tables' pinned statistics. We might + // also set snapshotPinned=true down below if we observe that the key is + // deleted by a range deletion in a higher stripe or that this key is a + // tombstone that could be elided if only it were in the last snapshot + // stripe. + i.snapshotPinned = i.iterStripeChange == newStripeSameKey + + if i.iterKey.Kind() == InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKey.Kind()) { + // Return the span so the compaction can use it for file truncation and add + // it to the relevant fragmenter. We do not set `skip` to true before + // returning as there may be a forthcoming point key with the same user key + // and sequence number. Such a point key must be visible (i.e., not skipped + // over) since we promise point keys are not deleted by range tombstones at + // the same sequence number. + // + // Although, note that `skip` may already be true before reaching here + // due to an earlier key in the stripe. Then it is fine to leave it set + // to true, as the earlier key must have had a higher sequence number. + // + // NOTE: there is a subtle invariant violation here in that calling + // saveKey and returning a reference to the temporary slice violates + // the stability guarantee for range deletion keys. A potential + // mediation could return the original iterKey and iterValue + // directly, as the backing memory is guaranteed to be stable until + // the compaction completes. The violation here is only minor in + // that the caller immediately clones the range deletion InternalKey + // when passing the key to the deletion fragmenter (see the + // call-site in compaction.go). + // TODO(travers): address this violation by removing the call to + // saveKey and instead return the original iterKey and iterValue. + // This goes against the comment on i.key in the struct, and + // therefore warrants some investigation. 
+ i.saveKey() + // TODO(jackson): Handle tracking pinned statistics for range keys + // and range deletions. This would require updating + // emitRangeDelChunk and rangeKeyCompactionTransform to update + // statistics when they apply their own snapshot striping logic. + i.snapshotPinned = false + i.value = i.iterValue + i.valid = true + return &i.key, i.value + } + + if cover := i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum); cover == keyspan.CoversVisibly { + // A pending range deletion deletes this key. Skip it. + i.saveKey() + i.skipInStripe() + continue + } else if cover == keyspan.CoversInvisibly { + // i.iterKey would be deleted by a range deletion if there weren't + // any open snapshots. Mark it as pinned. + // + // NB: there are multiple places in this file where we call + // i.rangeDelFrag.Covers and this is the only one where we are writing + // to i.snapshotPinned. Those other cases occur in mergeNext where the + // caller is deciding whether the value should be merged or not, and the + // key is in the same snapshot stripe. Hence, snapshotPinned is by + // definition false in those cases. + i.snapshotPinned = true + i.forceObsoleteDueToRangeDel = true + } else { + i.forceObsoleteDueToRangeDel = false + } + + switch i.iterKey.Kind() { + case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: + if i.elideTombstone(i.iterKey.UserKey) { + if i.curSnapshotIdx == 0 { + // If we're at the last snapshot stripe and the tombstone + // can be elided skip skippable keys in the same stripe. 
+ i.saveKey() + i.skipInStripe() + if i.iterStripeChange == newStripeSameKey { + panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe found a new stripe within the same key")) + } + if !i.skip && i.iterStripeChange != newStripeNewKey { + panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe disabled skip without advancing to new key")) + } + continue + } else { + // We're not at the last snapshot stripe, so the tombstone + // can NOT yet be elided. Mark it as pinned, so that it's + // included in table statistics appropriately. + i.snapshotPinned = true + } + } + + switch i.iterKey.Kind() { + case InternalKeyKindDelete: + i.saveKey() + i.value = i.iterValue + i.valid = true + i.skip = true + return &i.key, i.value + + case InternalKeyKindDeleteSized: + // We may skip subsequent keys because of this tombstone. Scan + // ahead to see just how much data this tombstone drops and if + // the tombstone's value should be updated accordingly. + return i.deleteSizedNext() + + case InternalKeyKindSingleDelete: + if i.singleDeleteNext() { + return &i.key, i.value + } else if i.err != nil { + return nil, nil + } + continue + + default: + panic(errors.AssertionFailedf( + "unexpected kind %s", redact.SafeString(i.iterKey.Kind().String()))) + } + + case InternalKeyKindSet, InternalKeyKindSetWithDelete: + // The key we emit for this entry is a function of the current key + // kind, and whether this entry is followed by a DEL/SINGLEDEL + // entry. setNext() does the work to move the iterator forward, + // preserving the original value, and potentially mutating the key + // kind. + i.setNext() + if i.err != nil { + return nil, nil + } + return &i.key, i.value + + case InternalKeyKindMerge: + // Record the snapshot index before mergeNext as merging + // advances the iterator, adjusting curSnapshotIdx. 
+ origSnapshotIdx := i.curSnapshotIdx + var valueMerger ValueMerger + valueMerger, i.err = i.merge(i.iterKey.UserKey, i.iterValue) + var change stripeChangeType + if i.err == nil { + change = i.mergeNext(valueMerger) + } + var needDelete bool + if i.err == nil { + // includesBase is true whenever we've transformed the MERGE record + // into a SET. + var includesBase bool + switch i.key.Kind() { + case InternalKeyKindSet, InternalKeyKindSetWithDelete: + includesBase = true + case InternalKeyKindMerge: + default: + panic(errors.AssertionFailedf( + "unexpected kind %s", redact.SafeString(i.key.Kind().String()))) + } + i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, includesBase) + } + if i.err == nil { + if needDelete { + i.valid = false + if i.closeValueCloser() != nil { + return nil, nil + } + continue + } + // A non-skippable entry does not necessarily cover later merge + // operands, so we must not zero the current merge result's seqnum. + // + // For example, suppose the forthcoming two keys are a range + // tombstone, `[a, b)#3`, and a merge operand, `a#3`. Recall that + // range tombstones do not cover point keys at the same seqnum, so + // `a#3` is not deleted. The range tombstone will be seen first due + // to its larger value type. Since it is a non-skippable key, the + // current merge will not include `a#3`. If we zeroed the current + // merge result's seqnum, then it would conflict with the upcoming + // merge including `a#3`, whose seqnum will also be zeroed. + if change != sameStripeNonSkippable { + i.maybeZeroSeqnum(origSnapshotIdx) + } + return &i.key, i.value + } + if i.err != nil { + i.valid = false + // TODO(sumeer): why is MarkCorruptionError only being called for + // MERGE? 
+ i.err = base.MarkCorruptionError(i.err) + } + return nil, nil + + default: + i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) + i.valid = false + return nil, nil + } + } + + return nil, nil +} + +func (i *compactionIter) closeValueCloser() error { + if i.valueCloser == nil { + return nil + } + + i.err = i.valueCloser.Close() + i.valueCloser = nil + if i.err != nil { + i.valid = false + } + return i.err +} + +// snapshotIndex returns the index of the first sequence number in snapshots +// which is greater than or equal to seq. +func snapshotIndex(seq uint64, snapshots []uint64) (int, uint64) { + index := sort.Search(len(snapshots), func(i int) bool { + return snapshots[i] > seq + }) + if index >= len(snapshots) { + return index, InternalKeySeqNumMax + } + return index, snapshots[index] +} + +// skipInStripe skips over skippable keys in the same stripe and user key. It +// may set i.err, in which case i.iterKey will be nil. +func (i *compactionIter) skipInStripe() { + i.skip = true + for i.nextInStripe() == sameStripeSkippable { + if i.err != nil { + panic(i.err) + } + } + // Reset skip if we landed outside the original stripe. Otherwise, we landed + // in the same stripe on a non-skippable key. In that case we should preserve + // `i.skip == true` such that later keys in the stripe will continue to be + // skipped. + if i.iterStripeChange == newStripeNewKey || i.iterStripeChange == newStripeSameKey { + i.skip = false + } +} + +func (i *compactionIter) iterNext() bool { + var iterValue LazyValue + i.iterKey, iterValue = i.iter.Next() + i.iterValue, _, i.err = iterValue.Value(nil) + if i.err != nil { + i.iterKey = nil + } + return i.iterKey != nil +} + +// stripeChangeType indicates how the snapshot stripe changed relative to the +// previous key. If no change, it also indicates whether the current entry is +// skippable. 
If the snapshot stripe changed, it also indicates whether the new +// stripe was entered because the iterator progressed onto an entirely new key +// or entered a new stripe within the same key. +type stripeChangeType int + +const ( + newStripeNewKey stripeChangeType = iota + newStripeSameKey + sameStripeSkippable + sameStripeNonSkippable +) + +// nextInStripe advances the iterator and returns one of the above const ints +// indicating how its state changed. +// +// Calls to nextInStripe must be preceded by a call to saveKey to retain a +// temporary reference to the original key, so that forward iteration can +// proceed with a reference to the original key. Care should be taken to avoid +// overwriting or mutating the saved key or value before they have been returned +// to the caller of the exported function (i.e. the caller of Next, First, etc.) +// +// nextInStripe may set i.err, in which case the return value will be +// newStripeNewKey, and i.iterKey will be nil. +func (i *compactionIter) nextInStripe() stripeChangeType { + i.iterStripeChange = i.nextInStripeHelper() + return i.iterStripeChange +} + +// nextInStripeHelper is an internal helper for nextInStripe; callers should use +// nextInStripe and not call nextInStripeHelper. +func (i *compactionIter) nextInStripeHelper() stripeChangeType { + if !i.iterNext() { + return newStripeNewKey + } + key := i.iterKey + + if !i.equal(i.key.UserKey, key.UserKey) { + i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots) + return newStripeNewKey + } + + // If i.key and key have the same user key, then + // 1. i.key must not have had a zero sequence number (or it would've been the last + // key with its user key). + // 2. i.key must have a strictly larger sequence number + // There's an exception in that either key may be a range delete. Range + // deletes may share a sequence number with a point key if the keys were + // ingested together.
Range keys may also share the sequence number if they + // were ingested, but range keys are interleaved into the compaction + // iterator's input iterator at the maximal sequence number so their + // original sequence number will not be observed here. + if prevSeqNum := base.SeqNumFromTrailer(i.keyTrailer); (prevSeqNum == 0 || prevSeqNum <= key.SeqNum()) && + i.key.Kind() != InternalKeyKindRangeDelete && key.Kind() != InternalKeyKindRangeDelete { + prevKey := i.key + prevKey.Trailer = i.keyTrailer + panic(errors.AssertionFailedf("pebble: invariant violation: %s and %s out of order", prevKey, key)) + } + + origSnapshotIdx := i.curSnapshotIdx + i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots) + switch key.Kind() { + case InternalKeyKindRangeDelete: + // Range tombstones need to be exposed by the compactionIter to the upper level + // `compaction` object, so return them regardless of whether they are in the same + // snapshot stripe. + if i.curSnapshotIdx == origSnapshotIdx { + return sameStripeNonSkippable + } + return newStripeSameKey + case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: + // Range keys are interleaved at the max sequence number for a given user + // key, so we should not see any more range keys in this stripe. 
+ panic("unreachable") + case InternalKeyKindInvalid: + if i.curSnapshotIdx == origSnapshotIdx { + return sameStripeNonSkippable + } + return newStripeSameKey + case InternalKeyKindDelete, InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSingleDelete, + InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized: + // Fall through + default: + i.iterKey = nil + i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) + i.valid = false + return newStripeNewKey + } + if i.curSnapshotIdx == origSnapshotIdx { + return sameStripeSkippable + } + return newStripeSameKey +} + +func (i *compactionIter) setNext() { + // Save the current key. + i.saveKey() + i.value = i.iterValue + i.valid = true + i.maybeZeroSeqnum(i.curSnapshotIdx) + + // There are two cases where we can early return and skip the remaining + // records in the stripe: + // - If the DB does not SETWITHDEL. + // - If this key is already a SETWITHDEL. + if i.formatVersion < FormatSetWithDelete || + i.iterKey.Kind() == InternalKeyKindSetWithDelete { + i.skip = true + return + } + + // We are iterating forward. Save the current value. + i.valueBuf = append(i.valueBuf[:0], i.iterValue...) + i.value = i.valueBuf + + // Else, we continue to loop through entries in the stripe looking for a + // DEL. Note that we may stop *before* encountering a DEL, if one exists. + for { + switch i.nextInStripe() { + case newStripeNewKey, newStripeSameKey: + i.pos = iterPosNext + return + case sameStripeNonSkippable: + i.pos = iterPosNext + // We iterated onto a key that we cannot skip. We can + // conservatively transform the original SET into a SETWITHDEL + // as an indication that there *may* still be a DEL/SINGLEDEL + // under this SET, even if we did not actually encounter one. 
+ // + // This is safe to do, as: + // + // - in the case that there *is not* actually a DEL/SINGLEDEL + // under this entry, any SINGLEDEL above this now-transformed + // SETWITHDEL will become a DEL when the two encounter in a + // compaction. The DEL will eventually be elided in a + // subsequent compaction. The cost for ensuring correctness is + // that this entry is kept around for an additional compaction + // cycle(s). + // + // - in the case there *is* indeed a DEL/SINGLEDEL under us + // (but in a different stripe or sstable), then we will have + // already done the work to transform the SET into a + // SETWITHDEL, and we will skip any additional iteration when + // this entry is encountered again in a subsequent compaction. + // + // Ideally, this codepath would be smart enough to handle the + // case of SET <- RANGEDEL <- ... <- DEL/SINGLEDEL <- .... + // This requires preserving any RANGEDEL entries we encounter + // along the way, then emitting the original (possibly + // transformed) key, followed by the RANGEDELs. This requires + // a sizable refactoring of the existing code, as nextInStripe + // currently returns a sameStripeNonSkippable when it + // encounters a RANGEDEL. + // TODO(travers): optimize to handle the RANGEDEL case if it + // turns out to be a performance problem. + i.key.SetKind(InternalKeyKindSetWithDelete) + + // By setting i.skip=true, we are saying that after the + // non-skippable key is emitted (which is likely a RANGEDEL), + // the remaining point keys that share the same user key as this + // saved key should be skipped. + i.skip = true + return + case sameStripeSkippable: + // We're still in the same stripe. If this is a + // DEL/SINGLEDEL/DELSIZED, we stop looking and emit a SETWITHDEL. + // Subsequent keys are eligible for skipping. 
+ switch i.iterKey.Kind() { + case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: + i.key.SetKind(InternalKeyKindSetWithDelete) + i.skip = true + return + case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSetWithDelete: + // Do nothing + default: + i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) + i.valid = false + } + default: + panic("pebble: unexpected stripeChangeType: " + strconv.Itoa(int(i.iterStripeChange))) + } + } +} + +func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { + // Save the current key. + i.saveKey() + i.valid = true + + // Loop looking for older values in the current snapshot stripe and merge + // them. + for { + if i.nextInStripe() != sameStripeSkippable { + i.pos = iterPosNext + return i.iterStripeChange + } + if i.err != nil { + panic(i.err) + } + key := i.iterKey + switch key.Kind() { + case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: + // We've hit a deletion tombstone. Return everything up to this point and + // then skip entries until the next snapshot stripe. We change the kind + // of the result key to a Set so that it shadows keys in lower + // levels. That is, MERGE+DEL -> SETWITHDEL. + // + // We do the same for SingleDelete since SingleDelete is only + // permitted (with deterministic behavior) for keys that have been + // set once since the last SingleDelete/Delete, so everything + // older is acceptable to shadow. Note that this is slightly + // different from singleDeleteNext() which implements stricter + // semantics in terms of applying the SingleDelete to the single + // next Set. But those stricter semantics are not observable to + // the end-user since Iterator interprets SingleDelete as Delete. 
+ // We could do something more complicated here and consume only a + // single Set, and then merge in any following Sets, but that is + // complicated wrt code and unnecessary given the narrow permitted + // use of SingleDelete. + i.key.SetKind(InternalKeyKindSetWithDelete) + i.skip = true + return sameStripeSkippable + + case InternalKeyKindSet, InternalKeyKindSetWithDelete: + if i.rangeDelFrag.Covers(*key, i.curSnapshotSeqNum) == keyspan.CoversVisibly { + // We change the kind of the result key to a Set so that it shadows + // keys in lower levels. That is, MERGE+RANGEDEL -> SET. This isn't + // strictly necessary, but provides consistency with the behavior of + // MERGE+DEL. + i.key.SetKind(InternalKeyKindSet) + i.skip = true + return sameStripeSkippable + } + + // We've hit a Set or SetWithDel value. Merge with the existing + // value and return. We change the kind of the resulting key to a + // Set so that it shadows keys in lower levels. That is: + // MERGE + (SET*) -> SET. + i.err = valueMerger.MergeOlder(i.iterValue) + if i.err != nil { + i.valid = false + return sameStripeSkippable + } + i.key.SetKind(InternalKeyKindSet) + i.skip = true + return sameStripeSkippable + + case InternalKeyKindMerge: + if i.rangeDelFrag.Covers(*key, i.curSnapshotSeqNum) == keyspan.CoversVisibly { + // We change the kind of the result key to a Set so that it shadows + // keys in lower levels. That is, MERGE+RANGEDEL -> SET. This isn't + // strictly necessary, but provides consistency with the behavior of + // MERGE+DEL. + i.key.SetKind(InternalKeyKindSet) + i.skip = true + return sameStripeSkippable + } + + // We've hit another Merge value. Merge with the existing value and + // continue looping. 
+ i.err = valueMerger.MergeOlder(i.iterValue) + if i.err != nil { + i.valid = false + return sameStripeSkippable + } + + default: + i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) + i.valid = false + return sameStripeSkippable + } + } +} + +// singleDeleteNext processes a SingleDelete point tombstone. A SingleDelete, or +// SINGLEDEL, is unique in that it deletes exactly 1 internal key. It's a +// performance optimization when the client knows a user key has not been +// overwritten, allowing the elision of the tombstone earlier, avoiding write +// amplification. +// +// singleDeleteNext returns a boolean indicating whether or not the caller +// should yield the SingleDelete key to the consumer of the compactionIter. If +// singleDeleteNext returns false, the caller may consume/elide the +// SingleDelete. +func (i *compactionIter) singleDeleteNext() bool { + // Save the current key. + i.saveKey() + i.value = i.iterValue + i.valid = true + + // Loop until finds a key to be passed to the next level. + for { + // If we find a key that can't be skipped, return true so that the + // caller yields the SingleDelete to the caller. + if i.nextInStripe() != sameStripeSkippable { + i.pos = iterPosNext + return i.err == nil + } + if i.err != nil { + panic(i.err) + } + key := i.iterKey + switch key.Kind() { + case InternalKeyKindDelete, InternalKeyKindMerge, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized: + // We've hit a Delete, DeleteSized, Merge, SetWithDelete, transform + // the SingleDelete into a full Delete. + i.key.SetKind(InternalKeyKindDelete) + i.skip = true + return true + + case InternalKeyKindSet: + // This SingleDelete deletes the Set, and we can now elide the + // SingleDel as well. We advance past the Set and return false to + // indicate to the main compaction loop that we should NOT yield the + // current SingleDel key to the compaction loop. 
+ i.nextInStripe() + // TODO(jackson): We could assert that nextInStripe either a) + // stepped onto a new key, or b) stepped on to a Delete, DeleteSized + // or SingleDel key. This would detect improper uses of SingleDel, + // but only when all three internal keys meet in the same compaction + // which is not likely. + i.valid = false + return false + + case InternalKeyKindSingleDelete: + // Two single deletes met in a compaction. With proper deterministic + // use of SingleDelete, this should never happen. The expectation is + // that there's exactly 1 set beneath a single delete. Currently, we + // opt to skip it. + // TODO(jackson): Should we make this an error? This would also + // allow us to simplify the code a bit by removing the for loop. + continue + + default: + i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) + i.valid = false + return false + } + } +} + +// deleteSizedNext processes a DELSIZED point tombstone. Unlike ordinary DELs, +// these tombstones carry a value that's a varint indicating the size of the +// entry (len(key)+len(value)) that the tombstone is expected to delete. +// +// When a deleteSizedNext is encountered, we skip ahead to see which keys, if +// any, are elided as a result of the tombstone. +func (i *compactionIter) deleteSizedNext() (*base.InternalKey, []byte) { + i.saveKey() + i.valid = true + i.skip = true + + // The DELSIZED tombstone may have no value at all. This happens when the + // tombstone has already deleted the key that the user originally predicted. + // In this case, we still peek forward in case there's another DELSIZED key + // with a lower sequence number, in which case we'll adopt its value. + if len(i.iterValue) == 0 { + i.value = i.valueBuf[:0] + } else { + i.valueBuf = append(i.valueBuf[:0], i.iterValue...) + i.value = i.valueBuf + } + + // Loop through all the keys within this stripe that are skippable. 
+ i.pos = iterPosNext + for i.nextInStripe() == sameStripeSkippable { + if i.err != nil { + panic(i.err) + } + switch i.iterKey.Kind() { + case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete: + // We encountered a tombstone (DEL, or DELSIZED) that's deleted by + // the original DELSIZED tombstone. This can happen in two cases: + // + // (1) These tombstones were intended to delete two distinct values, + // and this DELSIZED has already dropped the relevant key. For + // example: + // + // a.DELSIZED.9 a.SET.7 a.DELSIZED.5 a.SET.4 + // + // If a.DELSIZED.9 has already deleted a.SET.7, its size has + // already been zeroed out. In this case, we want to adopt the + // value of the DELSIZED with the lower sequence number, in + // case the a.SET.4 key has not yet been elided. + // + // (2) This DELSIZED was missized. The user thought they were + // deleting a key with this user key, but this user key had + // already been deleted. + // + // We can differentiate these two cases by examining the length of + // the DELSIZED's value. A DELSIZED's value holds the size of both + // the user key and value that it intends to delete. For any user + // key with a length > 1, a DELSIZED that has not deleted a key must + // have a value with a length > 1. + // + // We treat both cases the same functionally, adopting the identity + // of the lower-sequence numbered tombstone. However in the second + // case, we also increment the stat counting missized tombstones. + if len(i.value) > 0 { + // The original DELSIZED key was missized. The key that the user + // thought they were deleting does not exist. + i.stats.countMissizedDels++ + } + i.valueBuf = append(i.valueBuf[:0], i.iterValue...) + i.value = i.valueBuf + if i.iterKey.Kind() != InternalKeyKindDeleteSized { + // Convert the DELSIZED to a DEL—The DEL/SINGLEDEL we're eliding + // may not have deleted the key(s) it was intended to yet. 
The + // ordinary DEL compaction heuristics are better suited at that, + // plus we don't want to count it as a missized DEL. We early + // exit in this case, after skipping the remainder of the + // snapshot stripe. + i.key.SetKind(InternalKeyKindDelete) + // NB: We skipInStripe now, rather than returning leaving + // i.skip=true and returning early, because Next() requires + // that i.skip=true only if i.iterPos = iterPosCurForward. + // + // Ignore any error caused by skipInStripe since it does not affect + // the key/value being returned here, and the next call to Next() will + // expose it. + i.skipInStripe() + return &i.key, i.value + } + // Continue, in case we uncover another DELSIZED or a key this + // DELSIZED deletes. + + case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSetWithDelete: + // If the DELSIZED is value-less, it already deleted the key that it + // was intended to delete. This is possible with a sequence like: + // + // DELSIZED.8 SET.7 SET.3 + // + // The DELSIZED only describes the size of the SET.7, which in this + // case has already been elided. We don't count it as a missizing, + // instead converting the DELSIZED to a DEL. Skip the remainder of + // the snapshot stripe and return. + if len(i.value) == 0 { + i.key.SetKind(InternalKeyKindDelete) + // NB: We skipInStripe now, rather than returning leaving + // i.skip=true and returning early, because Next() requires + // that i.skip=true only if i.iterPos = iterPosCurForward. + // + // Ignore any error caused by skipInStripe since it does not affect + // the key/value being returned here, and the next call to Next() will + // expose it. + i.skipInStripe() + return &i.key, i.value + } + // The deleted key is not a DEL, DELSIZED, and the DELSIZED in i.key + // has a positive size. 
+			expectedSize, n := binary.Uvarint(i.value)
+			if n != len(i.value) {
+				i.err = base.CorruptionErrorf("DELSIZED holds invalid value: %x", errors.Safe(i.value))
+				i.valid = false
+				return nil, nil
+			}
+			elidedSize := uint64(len(i.iterKey.UserKey)) + uint64(len(i.iterValue))
+			if elidedSize != expectedSize {
+				// The original DELSIZED key was missized. It's unclear what to
+				// do. The user-provided size was wrong, so it's unlikely to be
+				// accurate or meaningful. We could:
+				//
+				//   1. return the DELSIZED with the original user-provided size unmodified
+				//   2. return the DELSIZED with a zeroed size to reflect that a key was
+				//      elided, even if it wasn't the anticipated size.
+				//   3. subtract the elided size from the estimate and re-encode.
+				//   4. convert the DELSIZED into a value-less DEL, so that
+				//      ordinary DEL heuristics apply.
+				//
+				// We opt for (4) under the rationale that we can't rely on the
+				// user-provided size for accuracy, so ordinary DEL heuristics
+				// are safer.
+				i.stats.countMissizedDels++
+				i.key.SetKind(InternalKeyKindDelete)
+				i.value = i.valueBuf[:0]
+				// NB: We skipInStripe now, rather than returning leaving
+				// i.skip=true and returning early, because Next() requires
+				// that i.skip=true only if i.iterPos = iterPosCurForward.
+				//
+				// Ignore any error caused by skipInStripe since it does not affect
+				// the key/value being returned here, and the next call to Next() will
+				// expose it.
+				i.skipInStripe()
+				return &i.key, i.value
+			}
+			// NB: We remove the value regardless of whether the key was sized
+			// appropriately. The size encoded is 'consumed' the first time it
+			// meets a key that it deletes.
+			i.value = i.valueBuf[:0]
+
+		default:
+			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
+			i.valid = false
+			return nil, nil
+		}
+	}
+	// Reset skip if we landed outside the original stripe. Otherwise, we landed
+	// in the same stripe on a non-skippable key. 
In that case we should preserve + // `i.skip == true` such that later keys in the stripe will continue to be + // skipped. + if i.iterStripeChange == newStripeNewKey || i.iterStripeChange == newStripeSameKey { + i.skip = false + } + if i.err != nil { + return nil, nil + } + return &i.key, i.value +} + +func (i *compactionIter) saveKey() { + i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) + i.key.UserKey = i.keyBuf + i.key.Trailer = i.iterKey.Trailer + i.keyTrailer = i.iterKey.Trailer + i.frontiers.Advance(i.key.UserKey) +} + +func (i *compactionIter) cloneKey(key []byte) []byte { + i.alloc, key = i.alloc.Copy(key) + return key +} + +func (i *compactionIter) Key() InternalKey { + return i.key +} + +func (i *compactionIter) Value() []byte { + return i.value +} + +func (i *compactionIter) Valid() bool { + return i.valid +} + +func (i *compactionIter) Error() error { + return i.err +} + +func (i *compactionIter) Close() error { + err := i.iter.Close() + if i.err == nil { + i.err = err + } + + // Close the closer for the current value if one was open. + if i.valueCloser != nil { + i.err = firstError(i.err, i.valueCloser.Close()) + i.valueCloser = nil + } + + return i.err +} + +// Tombstones returns a list of pending range tombstones in the fragmenter +// up to the specified key, or all pending range tombstones if key = nil. +func (i *compactionIter) Tombstones(key []byte) []keyspan.Span { + if key == nil { + i.rangeDelFrag.Finish() + } else { + // The specified end key is exclusive; no versions of the specified + // user key (including range tombstones covering that key) should + // be flushed yet. + i.rangeDelFrag.TruncateAndFlushTo(key) + } + tombstones := i.tombstones + i.tombstones = nil + return tombstones +} + +// RangeKeys returns a list of pending fragmented range keys up to the specified +// key, or all pending range keys if key = nil. 
+func (i *compactionIter) RangeKeys(key []byte) []keyspan.Span { + if key == nil { + i.rangeKeyFrag.Finish() + } else { + // The specified end key is exclusive; no versions of the specified + // user key (including range tombstones covering that key) should + // be flushed yet. + i.rangeKeyFrag.TruncateAndFlushTo(key) + } + rangeKeys := i.rangeKeys + i.rangeKeys = nil + return rangeKeys +} + +func (i *compactionIter) emitRangeDelChunk(fragmented keyspan.Span) { + // Apply the snapshot stripe rules, keeping only the latest tombstone for + // each snapshot stripe. + currentIdx := -1 + keys := fragmented.Keys[:0] + for _, k := range fragmented.Keys { + idx, _ := snapshotIndex(k.SeqNum(), i.snapshots) + if currentIdx == idx { + continue + } + if idx == 0 && i.elideRangeTombstone(fragmented.Start, fragmented.End) { + // This is the last snapshot stripe and the range tombstone + // can be elided. + break + } + + keys = append(keys, k) + if idx == 0 { + // This is the last snapshot stripe. + break + } + currentIdx = idx + } + if len(keys) > 0 { + i.tombstones = append(i.tombstones, keyspan.Span{ + Start: fragmented.Start, + End: fragmented.End, + Keys: keys, + }) + } +} + +func (i *compactionIter) emitRangeKeyChunk(fragmented keyspan.Span) { + // Elision of snapshot stripes happens in rangeKeyCompactionTransform, so no need to + // do that here. + if len(fragmented.Keys) > 0 { + i.rangeKeys = append(i.rangeKeys, fragmented) + } +} + +// maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing +// so improves compression and enables an optimization during forward iteration +// to skip some key comparisons. The seqnum for an entry can be zeroed if the +// entry is on the bottom snapshot stripe and on the bottom level of the LSM. +func (i *compactionIter) maybeZeroSeqnum(snapshotIdx int) { + if !i.allowZeroSeqNum { + // TODO(peter): allowZeroSeqNum applies to the entire compaction. 
We could
+		// make the determination on a key by key basis, similar to what is done
+		// for elideTombstone. Need to add a benchmark for compactionIter to verify
+		// that isn't too expensive.
+		return
+	}
+	if snapshotIdx > 0 {
+		// This is not the last snapshot
+		return
+	}
+	i.key.SetSeqNum(base.SeqNumZero)
+}
+
+// A frontier is used to monitor a compaction's progression across the user
+// keyspace.
+//
+// A frontier holds a user key boundary that it's concerned with in its `key`
+// field. If/when the compaction iterator returns an InternalKey with a user key
+// _k_ such that k ≥ frontier.key, the compaction iterator invokes the
+// frontier's `reached` function, passing _k_ as its argument.
+//
+// The `reached` function returns a new value to use as the key. If `reached`
+// returns nil, the frontier is forgotten and its `reached` method will not be
+// invoked again, unless the user calls [Update] to set a new key.
+//
+// A frontier's key may be updated outside the context of a `reached`
+// invocation at any time, through its Update method.
+type frontier struct {
+	// container points to the containing *frontiers that was passed to Init
+	// when the frontier was initialized.
+	container *frontiers
+
+	// key holds the frontier's current key. If nil, this frontier is inactive
+	// and its reached func will not be invoked. The value of this key may only
+	// be updated by the `frontiers` type, or the Update method.
+	key []byte
+
+	// reached is invoked to inform a frontier that its key has been reached.
+	// It's invoked with the user key that reached the limit. The `key` argument
+	// is guaranteed to be ≥ the frontier's key.
+	//
+	// After reached is invoked, the frontier's key is updated to the return
+	// value of `reached`. Nota bene, the frontier is permitted to update its
+	// key to a user key ≤ the argument `key`. 
+ // + // If a frontier is set to key k1, and reached(k2) is invoked (k2 ≥ k1), the + // frontier will receive reached(k2) calls until it returns nil or a key + // `k3` such that k2 < k3. This property is useful for frontiers that use + // `reached` invocations to drive iteration through collections of keys that + // may contain multiple keys that are both < k2 and ≥ k1. + reached func(key []byte) (next []byte) +} + +// Init initializes the frontier with the provided key and reached callback. +// The frontier is attached to the provided *frontiers and the provided reached +// func will be invoked when the *frontiers is advanced to a key ≥ this +// frontier's key. +func (f *frontier) Init( + frontiers *frontiers, initialKey []byte, reached func(key []byte) (next []byte), +) { + *f = frontier{ + container: frontiers, + key: initialKey, + reached: reached, + } + if initialKey != nil { + f.container.push(f) + } +} + +// String implements fmt.Stringer. +func (f *frontier) String() string { + return string(f.key) +} + +// Update replaces the existing frontier's key with the provided key. The +// frontier's reached func will be invoked when the new key is reached. +func (f *frontier) Update(key []byte) { + c := f.container + prevKeyIsNil := f.key == nil + f.key = key + if prevKeyIsNil { + if key != nil { + c.push(f) + } + return + } + + // Find the frontier within the heap (it must exist within the heap because + // f.key was != nil). If the frontier key is now nil, remove it from the + // heap. Otherwise, fix up its position. + for i := 0; i < len(c.items); i++ { + if c.items[i] == f { + if key != nil { + c.fix(i) + } else { + n := c.len() - 1 + c.swap(i, n) + c.down(i, n) + c.items = c.items[:n] + } + return + } + } + panic("unreachable") +} + +// frontiers is used to track progression of a task (eg, compaction) across the +// keyspace. Clients that want to be informed when the task advances to a key ≥ +// some frontier may register a frontier, providing a callback. 
The task calls +// `Advance(k)` with each user key encountered, which invokes the `reached` func +// on all tracked frontiers with `key`s ≤ k. +// +// Internally, frontiers is implemented as a simple heap. +type frontiers struct { + cmp Compare + items []*frontier +} + +// String implements fmt.Stringer. +func (f *frontiers) String() string { + var buf bytes.Buffer + for i := 0; i < len(f.items); i++ { + if i > 0 { + fmt.Fprint(&buf, ", ") + } + fmt.Fprintf(&buf, "%s: %q", f.items[i], f.items[i].key) + } + return buf.String() +} + +// Advance notifies all member frontiers with keys ≤ k. +func (f *frontiers) Advance(k []byte) { + for len(f.items) > 0 && f.cmp(k, f.items[0].key) >= 0 { + // This frontier has been reached. Invoke the closure and update with + // the next frontier. + f.items[0].key = f.items[0].reached(k) + if f.items[0].key == nil { + // This was the final frontier that this user was concerned with. + // Remove it from the heap. + f.pop() + } else { + // Fix up the heap root. + f.fix(0) + } + } +} + +func (f *frontiers) len() int { + return len(f.items) +} + +func (f *frontiers) less(i, j int) bool { + return f.cmp(f.items[i].key, f.items[j].key) < 0 +} + +func (f *frontiers) swap(i, j int) { + f.items[i], f.items[j] = f.items[j], f.items[i] +} + +// fix, up and down are copied from the go stdlib. 
+ +func (f *frontiers) fix(i int) { + if !f.down(i, f.len()) { + f.up(i) + } +} + +func (f *frontiers) push(ff *frontier) { + n := len(f.items) + f.items = append(f.items, ff) + f.up(n) +} + +func (f *frontiers) pop() *frontier { + n := f.len() - 1 + f.swap(0, n) + f.down(0, n) + item := f.items[n] + f.items = f.items[:n] + return item +} + +func (f *frontiers) up(j int) { + for { + i := (j - 1) / 2 // parent + if i == j || !f.less(j, i) { + break + } + f.swap(i, j) + j = i + } +} + +func (f *frontiers) down(i0, n int) bool { + i := i0 + for { + j1 := 2*i + 1 + if j1 >= n || j1 < 0 { // j1 < 0 after int overflow + break + } + j := j1 // left child + if j2 := j1 + 1; j2 < n && f.less(j2, j1) { + j = j2 // = 2*i + 2 // right child + } + if !f.less(j, i) { + break + } + f.swap(i, j) + i = j + } + return i > i0 +} diff --git a/pebble/compaction_iter_test.go b/pebble/compaction_iter_test.go new file mode 100644 index 0000000..07c489c --- /dev/null +++ b/pebble/compaction_iter_test.go @@ -0,0 +1,382 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "slices" + "strconv" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invalidating" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/rangekey" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/stretchr/testify/require" +) + +func TestSnapshotIndex(t *testing.T) { + testCases := []struct { + snapshots []uint64 + seq uint64 + expectedIndex int + expectedSeqNum uint64 + }{ + {[]uint64{}, 1, 0, InternalKeySeqNumMax}, + {[]uint64{1}, 0, 0, 1}, + {[]uint64{1}, 1, 1, InternalKeySeqNumMax}, + {[]uint64{1}, 2, 1, InternalKeySeqNumMax}, + {[]uint64{1, 3}, 1, 1, 3}, + {[]uint64{1, 3}, 2, 1, 3}, + {[]uint64{1, 3}, 3, 2, InternalKeySeqNumMax}, + {[]uint64{1, 3}, 4, 2, InternalKeySeqNumMax}, + {[]uint64{1, 3, 3}, 2, 1, 3}, + } + for _, c := range testCases { + t.Run("", func(t *testing.T) { + idx, seqNum := snapshotIndex(c.seq, c.snapshots) + if c.expectedIndex != idx { + t.Fatalf("expected %d, but got %d", c.expectedIndex, idx) + } + if c.expectedSeqNum != seqNum { + t.Fatalf("expected %d, but got %d", c.expectedSeqNum, seqNum) + } + }) + } +} + +type debugMerger struct { + buf []byte +} + +func (m *debugMerger) MergeNewer(value []byte) error { + m.buf = append(m.buf, value...) + return nil +} + +func (m *debugMerger) MergeOlder(value []byte) error { + buf := make([]byte, 0, len(m.buf)+len(value)) + buf = append(buf, value...) + buf = append(buf, m.buf...) + m.buf = buf + return nil +} + +func (m *debugMerger) Finish(includesBase bool) ([]byte, io.Closer, error) { + if includesBase { + m.buf = append(m.buf, []byte("[base]")...) 
+ } + return m.buf, nil, nil +} + +func TestCompactionIter(t *testing.T) { + var merge Merge + var keys []InternalKey + var rangeKeys []keyspan.Span + var vals [][]byte + var snapshots []uint64 + var elideTombstones bool + var allowZeroSeqnum bool + var interleavingIter *keyspan.InterleavingIter + + // The input to the data-driven test is dependent on the format major + // version we are testing against. + fileFunc := func(formatVersion FormatMajorVersion) string { + if formatVersion < FormatSetWithDelete { + return "testdata/compaction_iter" + } + if formatVersion < FormatDeleteSizedAndObsolete { + return "testdata/compaction_iter_set_with_del" + } + return "testdata/compaction_iter_delete_sized" + } + + newIter := func(formatVersion FormatMajorVersion) *compactionIter { + // To adhere to the existing assumption that range deletion blocks in + // SSTables are not released while iterating, and therefore not + // susceptible to use-after-free bugs, we skip the zeroing of + // RangeDelete keys. + fi := &fakeIter{keys: keys, vals: vals} + interleavingIter = &keyspan.InterleavingIter{} + interleavingIter.Init( + base.DefaultComparer, + fi, + keyspan.NewIter(base.DefaultComparer.Compare, rangeKeys), + keyspan.InterleavingIterOpts{}) + iter := invalidating.NewIter(interleavingIter, invalidating.IgnoreKinds(InternalKeyKindRangeDelete)) + if merge == nil { + merge = func(key, value []byte) (base.ValueMerger, error) { + m := &debugMerger{} + m.buf = append(m.buf, value...) 
+ return m, nil + } + } + + return newCompactionIter( + DefaultComparer.Compare, + DefaultComparer.Equal, + DefaultComparer.FormatKey, + merge, + iter, + snapshots, + &keyspan.Fragmenter{}, + &keyspan.Fragmenter{}, + allowZeroSeqnum, + func([]byte) bool { + return elideTombstones + }, + func(_, _ []byte) bool { + return elideTombstones + }, + formatVersion, + ) + } + + runTest := func(t *testing.T, formatVersion FormatMajorVersion) { + datadriven.RunTest(t, fileFunc(formatVersion), func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + merge = nil + if len(d.CmdArgs) > 0 && d.CmdArgs[0].Key == "merger" && + len(d.CmdArgs[0].Vals) > 0 && d.CmdArgs[0].Vals[0] == "deletable" { + merge = newDeletableSumValueMerger + } + keys = keys[:0] + vals = vals[:0] + rangeKeys = rangeKeys[:0] + for _, key := range strings.Split(d.Input, "\n") { + j := strings.Index(key, ":") + keys = append(keys, base.ParseInternalKey(key[:j])) + + if strings.HasPrefix(key[j+1:], "varint(") { + valueStr := strings.TrimSuffix(strings.TrimPrefix(key[j+1:], "varint("), ")") + v, err := strconv.ParseUint(valueStr, 10, 64) + require.NoError(t, err) + encodedValue := binary.AppendUvarint([]byte(nil), v) + vals = append(vals, encodedValue) + } else { + vals = append(vals, []byte(key[j+1:])) + } + } + return "" + + case "define-range-keys": + for _, key := range strings.Split(d.Input, "\n") { + s := keyspan.ParseSpan(strings.TrimSpace(key)) + rangeKeys = append(rangeKeys, s) + } + return "" + + case "iter": + snapshots = snapshots[:0] + elideTombstones = false + allowZeroSeqnum = false + printSnapshotPinned := false + printMissizedDels := false + printForceObsolete := false + for _, arg := range d.CmdArgs { + switch arg.Key { + case "snapshots": + for _, val := range arg.Vals { + seqNum, err := strconv.Atoi(val) + if err != nil { + return err.Error() + } + snapshots = append(snapshots, uint64(seqNum)) + } + case "elide-tombstones": + var err error + elideTombstones, err = 
strconv.ParseBool(arg.Vals[0]) + if err != nil { + return err.Error() + } + case "allow-zero-seqnum": + var err error + allowZeroSeqnum, err = strconv.ParseBool(arg.Vals[0]) + if err != nil { + return err.Error() + } + case "print-snapshot-pinned": + printSnapshotPinned = true + case "print-missized-dels": + printMissizedDels = true + case "print-force-obsolete": + printForceObsolete = true + default: + return fmt.Sprintf("%s: unknown arg: %s", d.Cmd, arg.Key) + } + } + slices.Sort(snapshots) + + iter := newIter(formatVersion) + var b bytes.Buffer + for _, line := range strings.Split(d.Input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + switch parts[0] { + case "first": + iter.First() + case "next": + iter.Next() + case "tombstones": + var key []byte + if len(parts) == 2 { + key = []byte(parts[1]) + } + for _, v := range iter.Tombstones(key) { + for _, k := range v.Keys { + fmt.Fprintf(&b, "%s-%s#%d\n", v.Start, v.End, k.SeqNum()) + } + } + fmt.Fprintf(&b, ".\n") + continue + case "range-keys": + var key []byte + if len(parts) == 2 { + key = []byte(parts[1]) + } + for _, v := range iter.RangeKeys(key) { + fmt.Fprintf(&b, "%s\n", v) + } + fmt.Fprintf(&b, ".\n") + continue + default: + return fmt.Sprintf("unknown op: %s", parts[0]) + } + if iter.Valid() { + snapshotPinned := "" + if printSnapshotPinned { + snapshotPinned = " (not pinned)" + if iter.snapshotPinned { + snapshotPinned = " (pinned)" + } + } + forceObsolete := "" + if printForceObsolete { + forceObsolete = " (not force obsolete)" + if iter.forceObsoleteDueToRangeDel { + forceObsolete = " (force obsolete)" + } + } + v := string(iter.Value()) + if iter.Key().Kind() == base.InternalKeyKindDeleteSized && len(iter.Value()) > 0 { + vn, n := binary.Uvarint(iter.Value()) + if n != len(iter.Value()) { + v = fmt.Sprintf("err: %0x value not a uvarint", iter.Value()) + } else { + v = fmt.Sprintf("varint(%d)", vn) + } + } + fmt.Fprintf(&b, "%s:%s%s%s\n", iter.Key(), v, 
snapshotPinned, forceObsolete) + if iter.Key().Kind() == InternalKeyKindRangeDelete { + iter.rangeDelFrag.Add(keyspan.Span{ + Start: append([]byte{}, iter.Key().UserKey...), + End: append([]byte{}, iter.Value()...), + Keys: []keyspan.Key{ + {Trailer: iter.Key().Trailer}, + }, + }) + } + if rangekey.IsRangeKey(iter.Key().Kind()) { + iter.rangeKeyFrag.Add(*interleavingIter.Span()) + } + } else if err := iter.Error(); err != nil { + fmt.Fprintf(&b, "err=%v\n", err) + } else { + fmt.Fprintf(&b, ".\n") + } + } + if printMissizedDels { + fmt.Fprintf(&b, "missized-dels=%d\n", iter.stats.countMissizedDels) + } + return b.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) + } + + // Rather than testing against all format version, we test against the + // significant boundaries. + formatVersions := []FormatMajorVersion{ + FormatMostCompatible, + FormatSetWithDelete - 1, + FormatSetWithDelete, + internalFormatNewest, + } + for _, formatVersion := range formatVersions { + t.Run(fmt.Sprintf("version-%s", formatVersion), func(t *testing.T) { + runTest(t, formatVersion) + }) + } +} + +func TestFrontiers(t *testing.T) { + cmp := testkeys.Comparer.Compare + var keySets [][][]byte + datadriven.RunTest(t, "testdata/frontiers", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "init": + // Init configures a frontier per line of input. Each line should + // contain a sorted whitespace-separated list of keys that the + // frontier will use. + // + // For example, the following input creates two separate monitored + // frontiers: one that sets its key successively to 'd', 'e', 'j' + // and one that sets its key to 'a', 'p', 'n', 'z': + // + // init + // b e j + // a p n z + + keySets = keySets[:0] + for _, line := range strings.Split(td.Input, "\n") { + keySets = append(keySets, bytes.Fields([]byte(line))) + } + return "" + case "scan": + f := &frontiers{cmp: cmp} + for _, keys := range keySets { + initTestFrontier(f, keys...) 
+			}
+			var buf bytes.Buffer
+			for _, kStr := range strings.Fields(td.Input) {
+				k := []byte(kStr)
+				f.Advance(k)
+				fmt.Fprintf(&buf, "%s : { %s }\n", kStr, f.String())
+			}
+			return buf.String()
+		default:
+			return fmt.Sprintf("unrecognized command %q", td.Cmd)
+		}
+	})
+}
+
+// initTestFrontier adds a new frontier to f that iterates through the provided
+// keys. The keys slice must be sorted.
+func initTestFrontier(f *frontiers, keys ...[]byte) *frontier {
+	ff := &frontier{}
+	var key []byte
+	if len(keys) > 0 {
+		key, keys = keys[0], keys[1:]
+	}
+	reached := func(k []byte) (nextKey []byte) {
+		if len(keys) > 0 {
+			nextKey, keys = keys[0], keys[1:]
+		}
+		return nextKey
+	}
+	ff.Init(f, key, reached)
+	return ff
+}
diff --git a/pebble/compaction_picker.go b/pebble/compaction_picker.go
new file mode 100644
index 0000000..6567391
--- /dev/null
+++ b/pebble/compaction_picker.go
@@ -0,0 +1,2068 @@
+// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
+// of this source code is governed by a BSD-style license that can be found in
+// the LICENSE file.
+
+package pebble
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"sort"
+	"strings"
+
+	"github.com/cockroachdb/pebble/internal/base"
+	"github.com/cockroachdb/pebble/internal/humanize"
+	"github.com/cockroachdb/pebble/internal/manifest"
+)
+
+// The minimum count for an intra-L0 compaction. This matches the RocksDB
+// heuristic.
+const minIntraL0Count = 4
+
+type compactionEnv struct {
+	// diskAvailBytes holds a statistic on the number of bytes available on
+	// disk, as reported by the filesystem. It's used to be more restrictive in
+	// expanding compactions if available disk space is limited.
+	//
+	// The cached value (d.diskAvailBytes) is updated whenever a file is deleted
+	// and whenever a compaction or flush completes. Since file removal is the
+	// primary means of reclaiming space, there is a rough bound on the
+	// statistic's staleness when available bytes is growing. 
Compactions and + // flushes are longer, slower operations and provide a much looser bound + // when available bytes is decreasing. + diskAvailBytes uint64 + earliestUnflushedSeqNum uint64 + earliestSnapshotSeqNum uint64 + inProgressCompactions []compactionInfo + readCompactionEnv readCompactionEnv +} + +type compactionPicker interface { + getScores([]compactionInfo) [numLevels]float64 + getBaseLevel() int + estimatedCompactionDebt(l0ExtraSize uint64) uint64 + pickAuto(env compactionEnv) (pc *pickedCompaction) + pickElisionOnlyCompaction(env compactionEnv) (pc *pickedCompaction) + pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) + pickReadTriggeredCompaction(env compactionEnv) (pc *pickedCompaction) + forceBaseLevel1() +} + +// readCompactionEnv is used to hold data required to perform read compactions +type readCompactionEnv struct { + rescheduleReadCompaction *bool + readCompactions *readCompactionQueue + flushing bool +} + +// Information about in-progress compactions provided to the compaction picker. +// These are used to constrain the new compactions that will be picked. +type compactionInfo struct { + // versionEditApplied is true if this compaction's version edit has already + // been committed. The compaction may still be in-progress deleting newly + // obsolete files. 
+ versionEditApplied bool + inputs []compactionLevel + outputLevel int + smallest InternalKey + largest InternalKey +} + +func (info compactionInfo) String() string { + var buf bytes.Buffer + var largest int + for i, in := range info.inputs { + if i > 0 { + fmt.Fprintf(&buf, " -> ") + } + fmt.Fprintf(&buf, "L%d", in.level) + in.files.Each(func(m *fileMetadata) { + fmt.Fprintf(&buf, " %s", m.FileNum) + }) + if largest < in.level { + largest = in.level + } + } + if largest != info.outputLevel || len(info.inputs) == 1 { + fmt.Fprintf(&buf, " -> L%d", info.outputLevel) + } + return buf.String() +} + +type sortCompactionLevelsByPriority []candidateLevelInfo + +func (s sortCompactionLevelsByPriority) Len() int { + return len(s) +} + +// A level should be picked for compaction if the compensatedScoreRatio is >= the +// compactionScoreThreshold. +const compactionScoreThreshold = 1 + +// Less should return true if s[i] must be placed earlier than s[j] in the final +// sorted list. The candidateLevelInfo for the level placed earlier is more likely +// to be picked for a compaction. +func (s sortCompactionLevelsByPriority) Less(i, j int) bool { + iShouldCompact := s[i].compensatedScoreRatio >= compactionScoreThreshold + jShouldCompact := s[j].compensatedScoreRatio >= compactionScoreThreshold + // Ordering is defined as decreasing on (shouldCompact, uncompensatedScoreRatio) + // where shouldCompact is 1 for true and 0 for false. + if iShouldCompact && !jShouldCompact { + return true + } + if !iShouldCompact && jShouldCompact { + return false + } + + if s[i].uncompensatedScoreRatio != s[j].uncompensatedScoreRatio { + return s[i].uncompensatedScoreRatio > s[j].uncompensatedScoreRatio + } + return s[i].level < s[j].level +} + +func (s sortCompactionLevelsByPriority) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +// sublevelInfo is used to tag a LevelSlice for an L0 sublevel with the +// sublevel. 
+type sublevelInfo struct { + manifest.LevelSlice + sublevel manifest.Level +} + +func (cl sublevelInfo) Clone() sublevelInfo { + return sublevelInfo{ + sublevel: cl.sublevel, + LevelSlice: cl.LevelSlice.Reslice(func(start, end *manifest.LevelIterator) {}), + } +} +func (cl sublevelInfo) String() string { + return fmt.Sprintf(`Sublevel %s; Levels %s`, cl.sublevel, cl.LevelSlice) +} + +// generateSublevelInfo will generate the level slices for each of the sublevels +// from the level slice for all of L0. +func generateSublevelInfo(cmp base.Compare, levelFiles manifest.LevelSlice) []sublevelInfo { + sublevelMap := make(map[uint64][]*fileMetadata) + it := levelFiles.Iter() + for f := it.First(); f != nil; f = it.Next() { + sublevelMap[uint64(f.SubLevel)] = append(sublevelMap[uint64(f.SubLevel)], f) + } + + var sublevels []int + for level := range sublevelMap { + sublevels = append(sublevels, int(level)) + } + sort.Ints(sublevels) + + var levelSlices []sublevelInfo + for _, sublevel := range sublevels { + metas := sublevelMap[uint64(sublevel)] + levelSlices = append( + levelSlices, + sublevelInfo{ + manifest.NewLevelSliceKeySorted(cmp, metas), + manifest.L0Sublevel(sublevel), + }, + ) + } + return levelSlices +} + +// compactionPickerMetrics holds metrics related to the compaction picking process +type compactionPickerMetrics struct { + // scores contains the compensatedScoreRatio from the candidateLevelInfo. + scores []float64 + singleLevelOverlappingRatio float64 + multiLevelOverlappingRatio float64 +} + +// pickedCompaction contains information about a compaction that has already +// been chosen, and is being constructed. Compaction construction info lives in +// this struct, and is copied over into the compaction struct when that's +// created. +type pickedCompaction struct { + cmp Compare + // score of the chosen compaction. This is the same as the + // compensatedScoreRatio in the candidateLevelInfo. + score float64 + // kind indicates the kind of compaction. 
+ kind compactionKind + // startLevel is the level that is being compacted. Inputs from startLevel + // and outputLevel will be merged to produce a set of outputLevel files. + startLevel *compactionLevel + // outputLevel is the level that files are being produced in. outputLevel is + // equal to startLevel+1 except when: + // - if startLevel is 0, the output level equals compactionPicker.baseLevel(). + // - in multilevel compaction, the output level is the lowest level involved in + // the compaction + outputLevel *compactionLevel + // extraLevels contain additional levels in between the input and output + // levels that get compacted in multi level compactions + extraLevels []*compactionLevel + inputs []compactionLevel + // LBase at the time of compaction picking. + baseLevel int + // L0-specific compaction info. Set to a non-nil value for all compactions + // where startLevel == 0 that were generated by L0Sublevels. + lcf *manifest.L0CompactionFiles + // maxOutputFileSize is the maximum size of an individual table created + // during compaction. + maxOutputFileSize uint64 + // maxOverlapBytes is the maximum number of bytes of overlap allowed for a + // single output table with the tables in the grandparent level. + maxOverlapBytes uint64 + // maxReadCompactionBytes is the maximum bytes a read compaction is allowed to + // overlap in its output level with. If the overlap is greater than + // maxReadCompaction bytes, then we don't proceed with the compaction. + maxReadCompactionBytes uint64 + // The boundaries of the input data. 
+ smallest InternalKey + largest InternalKey + version *version + pickerMetrics compactionPickerMetrics +} + +func defaultOutputLevel(startLevel, baseLevel int) int { + outputLevel := startLevel + 1 + if startLevel == 0 { + outputLevel = baseLevel + } + if outputLevel >= numLevels-1 { + outputLevel = numLevels - 1 + } + return outputLevel +} + +func newPickedCompaction( + opts *Options, cur *version, startLevel, outputLevel, baseLevel int, +) *pickedCompaction { + if startLevel > 0 && startLevel < baseLevel { + panic(fmt.Sprintf("invalid compaction: start level %d should not be empty (base level %d)", + startLevel, baseLevel)) + } + + adjustedLevel := adjustedOutputLevel(outputLevel, baseLevel) + pc := &pickedCompaction{ + cmp: opts.Comparer.Compare, + version: cur, + baseLevel: baseLevel, + inputs: []compactionLevel{{level: startLevel}, {level: outputLevel}}, + maxOutputFileSize: uint64(opts.Level(adjustedLevel).TargetFileSize), + maxOverlapBytes: maxGrandparentOverlapBytes(opts, adjustedLevel), + maxReadCompactionBytes: maxReadCompactionBytes(opts, adjustedLevel), + } + pc.startLevel = &pc.inputs[0] + pc.outputLevel = &pc.inputs[1] + return pc +} + +// adjustedOutputLevel is the output level used for the purpose of +// determining the target output file size, overlap bytes, and expanded +// bytes, taking into account the base level. +func adjustedOutputLevel(outputLevel int, baseLevel int) int { + adjustedOutputLevel := outputLevel + if adjustedOutputLevel > 0 { + // Output level is in the range [baseLevel, numLevels]. For the purpose of + // determining the target output file size, overlap bytes, and expanded + // bytes, we want to adjust the range to [1,numLevels]. 
+ adjustedOutputLevel = 1 + outputLevel - baseLevel + } + return adjustedOutputLevel +} + +func newPickedCompactionFromL0( + lcf *manifest.L0CompactionFiles, opts *Options, vers *version, baseLevel int, isBase bool, +) *pickedCompaction { + outputLevel := baseLevel + if !isBase { + outputLevel = 0 // Intra L0 + } + + pc := newPickedCompaction(opts, vers, 0, outputLevel, baseLevel) + pc.lcf = lcf + pc.outputLevel.level = outputLevel + + // Manually build the compaction as opposed to calling + // pickAutoHelper. This is because L0Sublevels has already added + // any overlapping L0 SSTables that need to be added, and + // because compactions built by L0SSTables do not necessarily + // pick contiguous sequences of files in pc.version.Levels[0]. + files := make([]*manifest.FileMetadata, 0, len(lcf.Files)) + iter := vers.Levels[0].Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if lcf.FilesIncluded[f.L0Index] { + files = append(files, f) + } + } + pc.startLevel.files = manifest.NewLevelSliceSeqSorted(files) + return pc +} + +func (pc *pickedCompaction) String() string { + var builder strings.Builder + builder.WriteString(fmt.Sprintf(`Score=%f, `, pc.score)) + builder.WriteString(fmt.Sprintf(`Kind=%s, `, pc.kind)) + builder.WriteString(fmt.Sprintf(`AdjustedOutputLevel=%d, `, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel))) + builder.WriteString(fmt.Sprintf(`maxOutputFileSize=%d, `, pc.maxOutputFileSize)) + builder.WriteString(fmt.Sprintf(`maxReadCompactionBytes=%d, `, pc.maxReadCompactionBytes)) + builder.WriteString(fmt.Sprintf(`smallest=%s, `, pc.smallest)) + builder.WriteString(fmt.Sprintf(`largest=%s, `, pc.largest)) + builder.WriteString(fmt.Sprintf(`version=%s, `, pc.version)) + builder.WriteString(fmt.Sprintf(`inputs=%s, `, pc.inputs)) + builder.WriteString(fmt.Sprintf(`startlevel=%s, `, pc.startLevel)) + builder.WriteString(fmt.Sprintf(`outputLevel=%s, `, pc.outputLevel)) + builder.WriteString(fmt.Sprintf(`extraLevels=%s, `, pc.extraLevels)) 
+ builder.WriteString(fmt.Sprintf(`l0SublevelInfo=%s, `, pc.startLevel.l0SublevelInfo)) + builder.WriteString(fmt.Sprintf(`lcf=%s`, pc.lcf)) + return builder.String() +} + +// Clone creates a deep copy of the pickedCompaction +func (pc *pickedCompaction) clone() *pickedCompaction { + + // Quickly copy over fields that do not require special deep copy care, and + // set all fields that will require a deep copy to nil. + newPC := &pickedCompaction{ + cmp: pc.cmp, + score: pc.score, + kind: pc.kind, + baseLevel: pc.baseLevel, + maxOutputFileSize: pc.maxOutputFileSize, + maxOverlapBytes: pc.maxOverlapBytes, + maxReadCompactionBytes: pc.maxReadCompactionBytes, + smallest: pc.smallest.Clone(), + largest: pc.largest.Clone(), + + // TODO(msbutler): properly clone picker metrics + pickerMetrics: pc.pickerMetrics, + + // Both copies see the same manifest, therefore, it's ok for them to se + // share the same pc. version. + version: pc.version, + } + + newPC.inputs = make([]compactionLevel, len(pc.inputs)) + newPC.extraLevels = make([]*compactionLevel, 0, len(pc.extraLevels)) + for i := range pc.inputs { + newPC.inputs[i] = pc.inputs[i].Clone() + if i == 0 { + newPC.startLevel = &newPC.inputs[i] + } else if i == len(pc.inputs)-1 { + newPC.outputLevel = &newPC.inputs[i] + } else { + newPC.extraLevels = append(newPC.extraLevels, &newPC.inputs[i]) + } + } + + if len(pc.startLevel.l0SublevelInfo) > 0 { + newPC.startLevel.l0SublevelInfo = make([]sublevelInfo, len(pc.startLevel.l0SublevelInfo)) + for i := range pc.startLevel.l0SublevelInfo { + newPC.startLevel.l0SublevelInfo[i] = pc.startLevel.l0SublevelInfo[i].Clone() + } + } + if pc.lcf != nil { + newPC.lcf = pc.lcf.Clone() + } + return newPC +} + +// maybeExpandedBounds is a helper function for setupInputs which ensures the +// pickedCompaction's smallest and largest internal keys are updated iff +// the candidate keys expand the key span. 
This avoids a bug for multi-level +// compactions: during the second call to setupInputs, the picked compaction's +// smallest and largest keys should not decrease the key span. +func (pc *pickedCompaction) maybeExpandBounds(smallest InternalKey, largest InternalKey) { + emptyKey := InternalKey{} + if base.InternalCompare(pc.cmp, smallest, emptyKey) == 0 { + if base.InternalCompare(pc.cmp, largest, emptyKey) != 0 { + panic("either both candidate keys are empty or neither are empty") + } + return + } + if base.InternalCompare(pc.cmp, pc.smallest, emptyKey) == 0 { + if base.InternalCompare(pc.cmp, pc.largest, emptyKey) != 0 { + panic("either both pc keys are empty or neither are empty") + } + pc.smallest = smallest + pc.largest = largest + return + } + if base.InternalCompare(pc.cmp, pc.smallest, smallest) >= 0 { + pc.smallest = smallest + } + if base.InternalCompare(pc.cmp, pc.largest, largest) <= 0 { + pc.largest = largest + } +} + +// setupInputs returns true if a compaction has been set up. It returns false if +// a concurrent compaction is occurring on the start or output level files. +func (pc *pickedCompaction) setupInputs( + opts *Options, diskAvailBytes uint64, startLevel *compactionLevel, +) bool { + // maxExpandedBytes is the maximum size of an expanded compaction. If + // growing a compaction results in a larger size, the original compaction + // is used instead. + maxExpandedBytes := expandedCompactionByteSizeLimit( + opts, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel), diskAvailBytes, + ) + + // Expand the initial inputs to a clean cut. + var isCompacting bool + startLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, startLevel.files, false /* disableIsCompacting */) + if isCompacting { + return false + } + pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, startLevel.files.Iter())) + + // Determine the sstables in the output level which overlap with the input + // sstables, and then expand those tables to a clean cut. 
No need to do + // this for intra-L0 compactions; outputLevel.files is left empty for those. + if startLevel.level != pc.outputLevel.level { + pc.outputLevel.files = pc.version.Overlaps(pc.outputLevel.level, pc.cmp, pc.smallest.UserKey, + pc.largest.UserKey, pc.largest.IsExclusiveSentinel()) + pc.outputLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, pc.outputLevel.files, + false /* disableIsCompacting */) + if isCompacting { + return false + } + pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, + startLevel.files.Iter(), pc.outputLevel.files.Iter())) + } + + // Grow the sstables in startLevel.level as long as it doesn't affect the number + // of sstables included from pc.outputLevel.level. + if pc.lcf != nil && startLevel.level == 0 && pc.outputLevel.level != 0 { + // Call the L0-specific compaction extension method. Similar logic as + // pc.grow. Additional L0 files are optionally added to the compaction at + // this step. Note that the bounds passed in are not the bounds of the + // compaction, but rather the smallest and largest internal keys that + // the compaction cannot include from L0 without pulling in more Lbase + // files. Consider this example: + // + // L0: c-d e+f g-h + // Lbase: a-b e+f i-j + // a b c d e f g h i j + // + // The e-f files have already been chosen in the compaction. As pulling + // in more LBase files is undesirable, the logic below will pass in + // smallest = b and largest = i to ExtendL0ForBaseCompactionTo, which + // will expand the compaction to include c-d and g-h from L0. The + // bounds passed in are exclusive; the compaction cannot be expanded + // to include files that "touch" it. 
+ smallestBaseKey := base.InvalidInternalKey + largestBaseKey := base.InvalidInternalKey + if pc.outputLevel.files.Empty() { + baseIter := pc.version.Levels[pc.outputLevel.level].Iter() + if sm := baseIter.SeekLT(pc.cmp, pc.smallest.UserKey); sm != nil { + smallestBaseKey = sm.Largest + } + if la := baseIter.SeekGE(pc.cmp, pc.largest.UserKey); la != nil { + largestBaseKey = la.Smallest + } + } else { + // NB: We use Reslice to access the underlying level's files, but + // we discard the returned slice. The pc.outputLevel.files slice + // is not modified. + _ = pc.outputLevel.files.Reslice(func(start, end *manifest.LevelIterator) { + if sm := start.Prev(); sm != nil { + smallestBaseKey = sm.Largest + } + if la := end.Next(); la != nil { + largestBaseKey = la.Smallest + } + }) + } + oldLcf := pc.lcf.Clone() + if pc.version.L0Sublevels.ExtendL0ForBaseCompactionTo(smallestBaseKey, largestBaseKey, pc.lcf) { + var newStartLevelFiles []*fileMetadata + iter := pc.version.Levels[0].Iter() + var sizeSum uint64 + for j, f := 0, iter.First(); f != nil; j, f = j+1, iter.Next() { + if pc.lcf.FilesIncluded[f.L0Index] { + newStartLevelFiles = append(newStartLevelFiles, f) + sizeSum += f.Size + } + } + if sizeSum+pc.outputLevel.files.SizeSum() < maxExpandedBytes { + startLevel.files = manifest.NewLevelSliceSeqSorted(newStartLevelFiles) + pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, + startLevel.files.Iter(), pc.outputLevel.files.Iter()) + } else { + *pc.lcf = *oldLcf + } + } + } else if pc.grow(pc.smallest, pc.largest, maxExpandedBytes, startLevel) { + pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, + startLevel.files.Iter(), pc.outputLevel.files.Iter())) + } + + if pc.startLevel.level == 0 { + // We don't change the input files for the compaction beyond this point. 
+		pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
+	}
+
+	return true
+}
+
+// grow grows the number of inputs at c.level without changing the number of
+// c.level+1 files in the compaction, and returns whether the inputs grew. sm
+// and la are the smallest and largest InternalKeys in all of the inputs.
+func (pc *pickedCompaction) grow(
+	sm, la InternalKey, maxExpandedBytes uint64, startLevel *compactionLevel,
+) bool {
+	if pc.outputLevel.files.Empty() {
+		return false
+	}
+	grow0 := pc.version.Overlaps(startLevel.level, pc.cmp, sm.UserKey,
+		la.UserKey, la.IsExclusiveSentinel())
+	grow0, isCompacting := expandToAtomicUnit(pc.cmp, grow0, false /* disableIsCompacting */)
+	if isCompacting {
+		return false
+	}
+	if grow0.Len() <= startLevel.files.Len() {
+		return false
+	}
+	if grow0.SizeSum()+pc.outputLevel.files.SizeSum() >= maxExpandedBytes {
+		return false
+	}
+	// We need to include the outputLevel iter because without it, in a multiLevel scenario,
+	// sm1 and la1 could shift the output level keyspace when pc.outputLevel.files is set to grow1.
+	sm1, la1 := manifest.KeyRange(pc.cmp, grow0.Iter(), pc.outputLevel.files.Iter())
+	grow1 := pc.version.Overlaps(pc.outputLevel.level, pc.cmp, sm1.UserKey,
+		la1.UserKey, la1.IsExclusiveSentinel())
+	grow1, isCompacting = expandToAtomicUnit(pc.cmp, grow1, false /* disableIsCompacting */)
+	if isCompacting {
+		return false
+	}
+	if grow1.Len() != pc.outputLevel.files.Len() {
+		return false
+	}
+	startLevel.files = grow0
+	pc.outputLevel.files = grow1
+	return true
+}
+
+func (pc *pickedCompaction) compactionSize() uint64 {
+	var bytesToCompact uint64
+	for i := range pc.inputs {
+		bytesToCompact += pc.inputs[i].files.SizeSum()
+	}
+	return bytesToCompact
+}
+
+// setupMultiLevelCandidate returns true if it successfully added another level
+// to the compaction.
+func (pc *pickedCompaction) setupMultiLevelCandidate(opts *Options, diskAvailBytes uint64) bool { + pc.inputs = append(pc.inputs, compactionLevel{level: pc.outputLevel.level + 1}) + + // Recalibrate startLevel and outputLevel: + // - startLevel and outputLevel pointers may be obsolete after appending to pc.inputs. + // - push outputLevel to extraLevels and move the new level to outputLevel + pc.startLevel = &pc.inputs[0] + pc.extraLevels = []*compactionLevel{&pc.inputs[1]} + pc.outputLevel = &pc.inputs[2] + return pc.setupInputs(opts, diskAvailBytes, pc.extraLevels[len(pc.extraLevels)-1]) +} + +// expandToAtomicUnit expands the provided level slice within its level both +// forwards and backwards to its "atomic compaction unit" boundaries, if +// necessary. +// +// While picking compaction inputs, this is required to maintain the invariant +// that the versions of keys at level+1 are older than the versions of keys at +// level. Tables are added to the right of the current slice tables such that +// the rightmost table has a "clean cut". A clean cut is either a change in +// user keys, or when the largest key in the left sstable is a range tombstone +// sentinel key (InternalKeyRangeDeleteSentinel). +// +// In addition to maintaining the seqnum invariant, expandToAtomicUnit is used +// to provide clean boundaries for range tombstone truncation during +// compaction. In order to achieve these clean boundaries, expandToAtomicUnit +// needs to find a "clean cut" on the left edge of the compaction as well. +// This is necessary in order for "atomic compaction units" to always be +// compacted as a unit. Failure to do this leads to a subtle bug with +// truncation of range tombstones to atomic compaction unit boundaries. +// Consider the scenario: +// +// L3: +// 12:[a#2,15-b#1,1] +// 13:[b#0,15-d#72057594037927935,15] +// +// These sstables contain a range tombstone [a-d)#2 which spans the two +// sstables. The two sstables need to always be kept together. 
Compacting +// sstable 13 independently of sstable 12 would result in: +// +// L3: +// 12:[a#2,15-b#1,1] +// L4: +// 14:[b#0,15-d#72057594037927935,15] +// +// This state is still ok, but when sstable 12 is next compacted, its range +// tombstones will be truncated at "b" (the largest key in its atomic +// compaction unit). In the scenario here, that could result in b#1 becoming +// visible when it should be deleted. +// +// isCompacting is returned true for any atomic units that contain files that +// have in-progress compactions, i.e. FileMetadata.Compacting == true. If +// disableIsCompacting is true, isCompacting always returns false. This helps +// avoid spurious races from being detected when this method is used outside +// of compaction picking code. +// +// TODO(jackson): Compactions and flushes no longer split a user key between two +// sstables. We could perform a migration, re-compacting any sstables with split +// user keys, which would allow us to remove atomic compaction unit expansion +// code. +func expandToAtomicUnit( + cmp Compare, inputs manifest.LevelSlice, disableIsCompacting bool, +) (slice manifest.LevelSlice, isCompacting bool) { + // NB: Inputs for L0 can't be expanded and *version.Overlaps guarantees + // that we get a 'clean cut.' For L0, Overlaps will return a slice without + // access to the rest of the L0 files, so it's OK to try to reslice. + if inputs.Empty() { + // Nothing to expand. + return inputs, false + } + + // TODO(jackson): Update to avoid use of LevelIterator.Current(). The + // Reslice interface will require some tweaking, because we currently rely + // on Reslice having already positioned the LevelIterator appropriately. 
+ + inputs = inputs.Reslice(func(start, end *manifest.LevelIterator) { + iter := start.Clone() + iter.Prev() + for cur, prev := start.Current(), iter.Current(); prev != nil; cur, prev = start.Prev(), iter.Prev() { + if cur.IsCompacting() { + isCompacting = true + } + if cmp(prev.Largest.UserKey, cur.Smallest.UserKey) < 0 { + break + } + if prev.Largest.IsExclusiveSentinel() { + // The table prev has a largest key indicating that the user key + // prev.largest.UserKey doesn't actually exist in the table. + break + } + // prev.Largest.UserKey == cur.Smallest.UserKey, so we need to + // include prev in the compaction. + } + + iter = end.Clone() + iter.Next() + for cur, next := end.Current(), iter.Current(); next != nil; cur, next = end.Next(), iter.Next() { + if cur.IsCompacting() { + isCompacting = true + } + if cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 { + break + } + if cur.Largest.IsExclusiveSentinel() { + // The table cur has a largest key indicating that the user key + // cur.largest.UserKey doesn't actually exist in the table. + break + } + // cur.Largest.UserKey == next.Smallest.UserKey, so we need to + // include next in the compaction. + } + }) + inputIter := inputs.Iter() + isCompacting = !disableIsCompacting && + (isCompacting || inputIter.First().IsCompacting() || inputIter.Last().IsCompacting()) + return inputs, isCompacting +} + +func newCompactionPicker( + v *version, opts *Options, inProgressCompactions []compactionInfo, +) compactionPicker { + p := &compactionPickerByScore{ + opts: opts, + vers: v, + } + p.initLevelMaxBytes(inProgressCompactions) + return p +} + +// Information about a candidate compaction level that has been identified by +// the compaction picker. +type candidateLevelInfo struct { + // The compensatedScore of the level after adjusting according to the other + // levels' sizes. For L0, the compensatedScoreRatio is equivalent to the + // uncompensatedScoreRatio as we don't account for level size compensation in + // L0. 
+ compensatedScoreRatio float64 + // The score of the level after accounting for level size compensation before + // adjusting according to other levels' sizes. For L0, the compensatedScore + // is equivalent to the uncompensatedScore as we don't account for level + // size compensation in L0. + compensatedScore float64 + // The score of the level to be compacted, calculated using uncompensated file + // sizes and without any adjustments. + uncompensatedScore float64 + // uncompensatedScoreRatio is the uncompensatedScore adjusted according to + // the other levels' sizes. + uncompensatedScoreRatio float64 + level int + // The level to compact to. + outputLevel int + // The file in level that will be compacted. Additional files may be + // picked by the compaction, and a pickedCompaction created for the + // compaction. + file manifest.LevelFile +} + +func (c *candidateLevelInfo) shouldCompact() bool { + return c.compensatedScoreRatio >= compactionScoreThreshold +} + +func fileCompensation(f *fileMetadata) uint64 { + return uint64(f.Stats.PointDeletionsBytesEstimate) + f.Stats.RangeDeletionsBytesEstimate +} + +// compensatedSize returns f's file size, inflated according to compaction +// priorities. +func compensatedSize(f *fileMetadata) uint64 { + // Add in the estimate of disk space that may be reclaimed by compacting the + // file's tombstones. + return f.Size + fileCompensation(f) +} + +// compensatedSizeAnnotator implements manifest.Annotator, annotating B-Tree +// nodes with the sum of the files' compensated sizes. Its annotation type is +// a *uint64. Compensated sizes may change once a table's stats are loaded +// asynchronously, so its values are marked as cacheable only if a file's +// stats have been loaded. 
+type compensatedSizeAnnotator struct { +} + +var _ manifest.Annotator = compensatedSizeAnnotator{} + +func (a compensatedSizeAnnotator) Zero(dst interface{}) interface{} { + if dst == nil { + return new(uint64) + } + v := dst.(*uint64) + *v = 0 + return v +} + +func (a compensatedSizeAnnotator) Accumulate( + f *fileMetadata, dst interface{}, +) (v interface{}, cacheOK bool) { + vptr := dst.(*uint64) + *vptr = *vptr + compensatedSize(f) + return vptr, f.StatsValid() +} + +func (a compensatedSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} { + srcV := src.(*uint64) + dstV := dst.(*uint64) + *dstV = *dstV + *srcV + return dstV +} + +// totalCompensatedSize computes the compensated size over a file metadata +// iterator. Note that this function is linear in the files available to the +// iterator. Use the compensatedSizeAnnotator if querying the total +// compensated size of a level. +func totalCompensatedSize(iter manifest.LevelIterator) uint64 { + var sz uint64 + for f := iter.First(); f != nil; f = iter.Next() { + sz += compensatedSize(f) + } + return sz +} + +// compactionPickerByScore holds the state and logic for picking a compaction. A +// compaction picker is associated with a single version. A new compaction +// picker is created and initialized every time a new version is installed. +type compactionPickerByScore struct { + opts *Options + vers *version + // The level to target for L0 compactions. Levels L1 to baseLevel must be + // empty. + baseLevel int + // levelMaxBytes holds the dynamically adjusted max bytes setting for each + // level. 
+ levelMaxBytes [numLevels]int64 +} + +var _ compactionPicker = &compactionPickerByScore{} + +func (p *compactionPickerByScore) getScores(inProgress []compactionInfo) [numLevels]float64 { + var scores [numLevels]float64 + for _, info := range p.calculateLevelScores(inProgress) { + scores[info.level] = info.compensatedScoreRatio + } + return scores +} + +func (p *compactionPickerByScore) getBaseLevel() int { + if p == nil { + return 1 + } + return p.baseLevel +} + +// estimatedCompactionDebt estimates the number of bytes which need to be +// compacted before the LSM tree becomes stable. +func (p *compactionPickerByScore) estimatedCompactionDebt(l0ExtraSize uint64) uint64 { + if p == nil { + return 0 + } + + // We assume that all the bytes in L0 need to be compacted to Lbase. This is + // unlike the RocksDB logic that figures out whether L0 needs compaction. + bytesAddedToNextLevel := l0ExtraSize + p.vers.Levels[0].Size() + lbaseSize := p.vers.Levels[p.baseLevel].Size() + + var compactionDebt uint64 + if bytesAddedToNextLevel > 0 && lbaseSize > 0 { + // We only incur compaction debt if both L0 and Lbase contain data. If L0 + // is empty, no compaction is necessary. If Lbase is empty, a move-based + // compaction from L0 would occur. + compactionDebt += bytesAddedToNextLevel + lbaseSize + } + + // loop invariant: At the beginning of the loop, bytesAddedToNextLevel is the + // bytes added to `level` in the loop. + for level := p.baseLevel; level < numLevels-1; level++ { + levelSize := p.vers.Levels[level].Size() + bytesAddedToNextLevel + nextLevelSize := p.vers.Levels[level+1].Size() + if levelSize > uint64(p.levelMaxBytes[level]) { + bytesAddedToNextLevel = levelSize - uint64(p.levelMaxBytes[level]) + if nextLevelSize > 0 { + // We only incur compaction debt if the next level contains data. If the + // next level is empty, a move-based compaction would be used. 
+ levelRatio := float64(nextLevelSize) / float64(levelSize) + // The current level contributes bytesAddedToNextLevel to compactions. + // The next level contributes levelRatio * bytesAddedToNextLevel. + compactionDebt += uint64(float64(bytesAddedToNextLevel) * (levelRatio + 1)) + } + } else { + // We're not moving any bytes to the next level. + bytesAddedToNextLevel = 0 + } + } + return compactionDebt +} + +func (p *compactionPickerByScore) initLevelMaxBytes(inProgressCompactions []compactionInfo) { + // The levelMaxBytes calculations here differ from RocksDB in two ways: + // + // 1. The use of dbSize vs maxLevelSize. RocksDB uses the size of the maximum + // level in L1-L6, rather than determining the size of the bottom level + // based on the total amount of data in the dB. The RocksDB calculation is + // problematic if L0 contains a significant fraction of data, or if the + // level sizes are roughly equal and thus there is a significant fraction + // of data outside of the largest level. + // + // 2. Not adjusting the size of Lbase based on L0. RocksDB computes + // baseBytesMax as the maximum of the configured LBaseMaxBytes and the + // size of L0. This is problematic because baseBytesMax is used to compute + // the max size of lower levels. A very large baseBytesMax will result in + // an overly large value for the size of lower levels which will caused + // those levels not to be compacted even when they should be + // compacted. This often results in "inverted" LSM shapes where Ln is + // larger than Ln+1. + + // Determine the first non-empty level and the total DB size. 
+ firstNonEmptyLevel := -1 + var dbSize uint64 + for level := 1; level < numLevels; level++ { + if p.vers.Levels[level].Size() > 0 { + if firstNonEmptyLevel == -1 { + firstNonEmptyLevel = level + } + dbSize += p.vers.Levels[level].Size() + } + } + for _, c := range inProgressCompactions { + if c.outputLevel == 0 || c.outputLevel == -1 { + continue + } + if c.inputs[0].level == 0 && (firstNonEmptyLevel == -1 || c.outputLevel < firstNonEmptyLevel) { + firstNonEmptyLevel = c.outputLevel + } + } + + // Initialize the max-bytes setting for each level to "infinity" which will + // disallow compaction for that level. We'll fill in the actual value below + // for levels we want to allow compactions from. + for level := 0; level < numLevels; level++ { + p.levelMaxBytes[level] = math.MaxInt64 + } + + if dbSize == 0 { + // No levels for L1 and up contain any data. Target L0 compactions for the + // last level or to the level to which there is an ongoing L0 compaction. + p.baseLevel = numLevels - 1 + if firstNonEmptyLevel >= 0 { + p.baseLevel = firstNonEmptyLevel + } + return + } + + dbSize += p.vers.Levels[0].Size() + bottomLevelSize := dbSize - dbSize/uint64(p.opts.Experimental.LevelMultiplier) + + curLevelSize := bottomLevelSize + for level := numLevels - 2; level >= firstNonEmptyLevel; level-- { + curLevelSize = uint64(float64(curLevelSize) / float64(p.opts.Experimental.LevelMultiplier)) + } + + // Compute base level (where L0 data is compacted to). 
+ baseBytesMax := uint64(p.opts.LBaseMaxBytes) + p.baseLevel = firstNonEmptyLevel + for p.baseLevel > 1 && curLevelSize > baseBytesMax { + p.baseLevel-- + curLevelSize = uint64(float64(curLevelSize) / float64(p.opts.Experimental.LevelMultiplier)) + } + + smoothedLevelMultiplier := 1.0 + if p.baseLevel < numLevels-1 { + smoothedLevelMultiplier = math.Pow( + float64(bottomLevelSize)/float64(baseBytesMax), + 1.0/float64(numLevels-p.baseLevel-1)) + } + + levelSize := float64(baseBytesMax) + for level := p.baseLevel; level < numLevels; level++ { + if level > p.baseLevel && levelSize > 0 { + levelSize *= smoothedLevelMultiplier + } + // Round the result since test cases use small target level sizes, which + // can be impacted by floating-point imprecision + integer truncation. + roundedLevelSize := math.Round(levelSize) + if roundedLevelSize > float64(math.MaxInt64) { + p.levelMaxBytes[level] = math.MaxInt64 + } else { + p.levelMaxBytes[level] = int64(roundedLevelSize) + } + } +} + +type levelSizeAdjust struct { + incomingActualBytes uint64 + outgoingActualBytes uint64 + outgoingCompensatedBytes uint64 +} + +func (a levelSizeAdjust) compensated() uint64 { + return a.incomingActualBytes - a.outgoingCompensatedBytes +} + +func (a levelSizeAdjust) actual() uint64 { + return a.incomingActualBytes - a.outgoingActualBytes +} + +func calculateSizeAdjust(inProgressCompactions []compactionInfo) [numLevels]levelSizeAdjust { + // Compute size adjustments for each level based on the in-progress + // compactions. We sum the file sizes of all files leaving and entering each + // level in in-progress compactions. For outgoing files, we also sum a + // separate sum of 'compensated file sizes', which are inflated according + // to deletion estimates. + // + // When we adjust a level's size according to these values during score + // calculation, we subtract the compensated size of start level inputs to + // account for the fact that score calculation uses compensated sizes. 
+ // + // Since compensated file sizes may be compensated because they reclaim + // space from the output level's files, we only add the real file size to + // the output level. + // + // This is slightly different from RocksDB's behavior, which simply elides + // compacting files from the level size calculation. + var sizeAdjust [numLevels]levelSizeAdjust + for i := range inProgressCompactions { + c := &inProgressCompactions[i] + // If this compaction's version edit has already been applied, there's + // no need to adjust: The LSM we'll examine will already reflect the + // new LSM state. + if c.versionEditApplied { + continue + } + + for _, input := range c.inputs { + actualSize := input.files.SizeSum() + compensatedSize := totalCompensatedSize(input.files.Iter()) + + if input.level != c.outputLevel { + sizeAdjust[input.level].outgoingCompensatedBytes += compensatedSize + sizeAdjust[input.level].outgoingActualBytes += actualSize + if c.outputLevel != -1 { + sizeAdjust[c.outputLevel].incomingActualBytes += actualSize + } + } + } + } + return sizeAdjust +} + +func levelCompensatedSize(lm manifest.LevelMetadata) uint64 { + return *lm.Annotation(compensatedSizeAnnotator{}).(*uint64) +} + +func (p *compactionPickerByScore) calculateLevelScores( + inProgressCompactions []compactionInfo, +) [numLevels]candidateLevelInfo { + var scores [numLevels]candidateLevelInfo + for i := range scores { + scores[i].level = i + scores[i].outputLevel = i + 1 + } + l0UncompensatedScore := calculateL0UncompensatedScore(p.vers, p.opts, inProgressCompactions) + scores[0] = candidateLevelInfo{ + outputLevel: p.baseLevel, + uncompensatedScore: l0UncompensatedScore, + compensatedScore: l0UncompensatedScore, /* No level size compensation for L0 */ + } + sizeAdjust := calculateSizeAdjust(inProgressCompactions) + for level := 1; level < numLevels; level++ { + compensatedLevelSize := levelCompensatedSize(p.vers.Levels[level]) + sizeAdjust[level].compensated() + scores[level].compensatedScore = 
float64(compensatedLevelSize) / float64(p.levelMaxBytes[level]) + scores[level].uncompensatedScore = float64(p.vers.Levels[level].Size()+sizeAdjust[level].actual()) / float64(p.levelMaxBytes[level]) + } + + // Adjust each level's {compensated, uncompensated}Score by the uncompensatedScore + // of the next level to get a {compensated, uncompensated}ScoreRatio. If the + // next level has a high uncompensatedScore, and is thus a priority for compaction, + // this reduces the priority for compacting the current level. If the next level + // has a low uncompensatedScore (i.e. it is below its target size), this increases + // the priority for compacting the current level. + // + // The effect of this adjustment is to help prioritize compactions in lower + // levels. The following example shows the compensatedScoreRatio and the + // compensatedScore. In this scenario, L0 has 68 sublevels. L3 (a.k.a. Lbase) + // is significantly above its target size. The original score prioritizes + // compactions from those two levels, but doing so ends up causing a future + // problem: data piles up in the higher levels, starving L5->L6 compactions, + // and to a lesser degree starving L4->L5 compactions. + // + // Note that in the example shown there is no level size compensation so the + // compensatedScore and the uncompensatedScore is the same for each level. + // + // compensatedScoreRatio compensatedScore uncompensatedScore size max-size + // L0 3.2 68.0 68.0 2.2 G - + // L3 3.2 21.1 21.1 1.3 G 64 M + // L4 3.4 6.7 6.7 3.1 G 467 M + // L5 3.4 2.0 2.0 6.6 G 3.3 G + // L6 0.6 0.6 0.6 14 G 24 G + var prevLevel int + for level := p.baseLevel; level < numLevels; level++ { + // The compensated scores, and uncompensated scores will be turned into + // ratios as they're adjusted according to other levels' sizes. 
+ scores[prevLevel].compensatedScoreRatio = scores[prevLevel].compensatedScore + scores[prevLevel].uncompensatedScoreRatio = scores[prevLevel].uncompensatedScore + + // Avoid absurdly large scores by placing a floor on the score that we'll + // adjust a level by. The value of 0.01 was chosen somewhat arbitrarily. + const minScore = 0.01 + if scores[prevLevel].compensatedScoreRatio >= compactionScoreThreshold { + if scores[level].uncompensatedScore >= minScore { + scores[prevLevel].compensatedScoreRatio /= scores[level].uncompensatedScore + } else { + scores[prevLevel].compensatedScoreRatio /= minScore + } + } + if scores[prevLevel].uncompensatedScoreRatio >= compactionScoreThreshold { + if scores[level].uncompensatedScore >= minScore { + scores[prevLevel].uncompensatedScoreRatio /= scores[level].uncompensatedScore + } else { + scores[prevLevel].uncompensatedScoreRatio /= minScore + } + } + prevLevel = level + } + // Set the score ratios for the lowest level. + // INVARIANT: prevLevel == numLevels-1 + scores[prevLevel].compensatedScoreRatio = scores[prevLevel].compensatedScore + scores[prevLevel].uncompensatedScoreRatio = scores[prevLevel].uncompensatedScore + + sort.Sort(sortCompactionLevelsByPriority(scores[:])) + return scores +} + +// calculateL0UncompensatedScore calculates a float score representing the +// relative priority of compacting L0. Level L0 is special in that files within +// L0 may overlap one another, so a different set of heuristics that take into +// account read amplification apply. +func calculateL0UncompensatedScore( + vers *version, opts *Options, inProgressCompactions []compactionInfo, +) float64 { + // Use the sublevel count to calculate the score. The base vs intra-L0 + // compaction determination happens in pickAuto, not here. 
+ score := float64(2*vers.L0Sublevels.MaxDepthAfterOngoingCompactions()) / + float64(opts.L0CompactionThreshold) + + // Also calculate a score based on the file count but use it only if it + // produces a higher score than the sublevel-based one. This heuristic is + // designed to accommodate cases where L0 is accumulating non-overlapping + // files in L0. Letting too many non-overlapping files accumulate in few + // sublevels is undesirable, because: + // 1) we can produce a massive backlog to compact once files do overlap. + // 2) constructing L0 sublevels has a runtime that grows superlinearly with + // the number of files in L0 and must be done while holding D.mu. + noncompactingFiles := vers.Levels[0].Len() + for _, c := range inProgressCompactions { + for _, cl := range c.inputs { + if cl.level == 0 { + noncompactingFiles -= cl.files.Len() + } + } + } + fileScore := float64(noncompactingFiles) / float64(opts.L0CompactionFileThreshold) + if score < fileScore { + score = fileScore + } + return score +} + +// pickCompactionSeedFile picks a file from `level` in the `vers` to build a +// compaction around. Currently, this function implements a heuristic similar to +// RocksDB's kMinOverlappingRatio, seeking to minimize write amplification. This +// function is linear with respect to the number of files in `level` and +// `outputLevel`. +func pickCompactionSeedFile( + vers *version, opts *Options, level, outputLevel int, earliestSnapshotSeqNum uint64, +) (manifest.LevelFile, bool) { + // Select the file within the level to compact. We want to minimize write + // amplification, but also ensure that deletes are propagated to the + // bottom level in a timely fashion so as to reclaim disk space. A table's + // smallest sequence number provides a measure of its age. The ratio of + // overlapping-bytes / table-size gives an indication of write + // amplification (a smaller ratio is preferable). 
+ // + // The current heuristic is based off the RocksDB kMinOverlappingRatio + // heuristic. It chooses the file with the minimum overlapping ratio with + // the target level, which minimizes write amplification. + // + // It uses a "compensated size" for the denominator, which is the file + // size but artificially inflated by an estimate of the space that may be + // reclaimed through compaction. Currently, we only compensate for range + // deletions and only with a rough estimate of the reclaimable bytes. This + // differs from RocksDB which only compensates for point tombstones and + // only if they exceed the number of non-deletion entries in table. + // + // TODO(peter): For concurrent compactions, we may want to try harder to + // pick a seed file whose resulting compaction bounds do not overlap with + // an in-progress compaction. + + cmp := opts.Comparer.Compare + startIter := vers.Levels[level].Iter() + outputIter := vers.Levels[outputLevel].Iter() + + var file manifest.LevelFile + smallestRatio := uint64(math.MaxUint64) + + outputFile := outputIter.First() + + for f := startIter.First(); f != nil; f = startIter.Next() { + var overlappingBytes uint64 + compacting := f.IsCompacting() + if compacting { + // Move on if this file is already being compacted. We'll likely + // still need to move past the overlapping output files regardless, + // but in cases where all start-level files are compacting we won't. + continue + } + + // Trim any output-level files smaller than f. 
+ for outputFile != nil && sstableKeyCompare(cmp, outputFile.Largest, f.Smallest) < 0 { + outputFile = outputIter.Next() + } + + for outputFile != nil && sstableKeyCompare(cmp, outputFile.Smallest, f.Largest) <= 0 && !compacting { + overlappingBytes += outputFile.Size + compacting = compacting || outputFile.IsCompacting() + + // For files in the bottommost level of the LSM, the + // Stats.RangeDeletionsBytesEstimate field is set to the estimate + // of bytes /within/ the file itself that may be dropped by + // recompacting the file. These bytes from obsolete keys would not + // need to be rewritten if we compacted `f` into `outputFile`, so + // they don't contribute to write amplification. Subtracting them + // out of the overlapping bytes helps prioritize these compactions + // that are cheaper than their file sizes suggest. + if outputLevel == numLevels-1 && outputFile.LargestSeqNum < earliestSnapshotSeqNum { + overlappingBytes -= outputFile.Stats.RangeDeletionsBytesEstimate + } + + // If the file in the next level extends beyond f's largest key, + // break out and don't advance outputIter because f's successor + // might also overlap. + // + // Note, we stop as soon as we encounter an output-level file with a + // largest key beyond the input-level file's largest bound. We + // perform a simple user key comparison here using sstableKeyCompare + // which handles the potential for exclusive largest key bounds. + // There's some subtlety when the bounds are equal (eg, equal and + // inclusive, or equal and exclusive). Current Pebble doesn't split + // user keys across sstables within a level (and in format versions + // FormatSplitUserKeysMarkedCompacted and later we guarantee no + // split user keys exist within the entire LSM). In that case, we're + // assured that neither the input level nor the output level's next + // file shares the same user key, so compaction expansion will not + // include them in any compaction compacting `f`. 
+ // + // NB: If we /did/ allow split user keys, or we're running on an + // old database with an earlier format major version where there are + // existing split user keys, this logic would be incorrect. Consider + // L1: [a#120,a#100] [a#80,a#60] + // L2: [a#55,a#45] [a#35,a#25] [a#15,a#5] + // While considering the first file in L1, [a#120,a#100], we'd skip + // past all of the files in L2. When considering the second file in + // L1, we'd improperly conclude that the second file overlaps + // nothing in the second level and is cheap to compact, when in + // reality we'd need to expand the compaction to include all 5 + // files. + if sstableKeyCompare(cmp, outputFile.Largest, f.Largest) > 0 { + break + } + outputFile = outputIter.Next() + } + + // If the input level file or one of the overlapping files is + // compacting, we're not going to be able to compact this file + // anyways, so skip it. + if compacting { + continue + } + + compSz := compensatedSize(f) + scaledRatio := overlappingBytes * 1024 / compSz + if scaledRatio < smallestRatio { + smallestRatio = scaledRatio + file = startIter.Take() + } + } + return file, file.FileMetadata != nil +} + +// pickAuto picks the best compaction, if any. +// +// On each call, pickAuto computes per-level size adjustments based on +// in-progress compactions, and computes a per-level score. The levels are +// iterated over in decreasing score order trying to find a valid compaction +// anchored at that level. +// +// If a score-based compaction cannot be found, pickAuto falls back to looking +// for an elision-only compaction to remove obsolete keys. +func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompaction) { + // Compaction concurrency is controlled by L0 read-amp. We allow one + // additional compaction per L0CompactionConcurrency sublevels, as well as + // one additional compaction per CompactionDebtConcurrency bytes of + // compaction debt. 
Compaction concurrency is tied to L0 sublevels as that + // signal is independent of the database size. We tack on the compaction + // debt as a second signal to prevent compaction concurrency from dropping + // significantly right after a base compaction finishes, and before those + // bytes have been compacted further down the LSM. + if n := len(env.inProgressCompactions); n > 0 { + l0ReadAmp := p.vers.L0Sublevels.MaxDepthAfterOngoingCompactions() + compactionDebt := p.estimatedCompactionDebt(0) + ccSignal1 := n * p.opts.Experimental.L0CompactionConcurrency + ccSignal2 := uint64(n) * p.opts.Experimental.CompactionDebtConcurrency + if l0ReadAmp < ccSignal1 && compactionDebt < ccSignal2 { + return nil + } + } + + scores := p.calculateLevelScores(env.inProgressCompactions) + + // TODO(bananabrick): Either remove, or change this into an event sent to the + // EventListener. + logCompaction := func(pc *pickedCompaction) { + var buf bytes.Buffer + for i := 0; i < numLevels; i++ { + if i != 0 && i < p.baseLevel { + continue + } + + var info *candidateLevelInfo + for j := range scores { + if scores[j].level == i { + info = &scores[j] + break + } + } + + marker := " " + if pc.startLevel.level == info.level { + marker = "*" + } + fmt.Fprintf(&buf, " %sL%d: %5.1f %5.1f %5.1f %5.1f %8s %8s", + marker, info.level, info.compensatedScoreRatio, info.compensatedScore, + info.uncompensatedScoreRatio, info.uncompensatedScore, + humanize.Bytes.Int64(int64(totalCompensatedSize( + p.vers.Levels[info.level].Iter(), + ))), + humanize.Bytes.Int64(p.levelMaxBytes[info.level]), + ) + + count := 0 + for i := range env.inProgressCompactions { + c := &env.inProgressCompactions[i] + if c.inputs[0].level != info.level { + continue + } + count++ + if count == 1 { + fmt.Fprintf(&buf, " [") + } else { + fmt.Fprintf(&buf, " ") + } + fmt.Fprintf(&buf, "L%d->L%d", c.inputs[0].level, c.outputLevel) + } + if count > 0 { + fmt.Fprintf(&buf, "]") + } + fmt.Fprintf(&buf, "\n") + } + 
p.opts.Logger.Infof("pickAuto: L%d->L%d\n%s", + pc.startLevel.level, pc.outputLevel.level, buf.String()) + } + + // Check for a score-based compaction. candidateLevelInfos are first sorted + // by whether they should be compacted, so if we find a level which shouldn't + // be compacted, we can break early. + for i := range scores { + info := &scores[i] + if !info.shouldCompact() { + break + } + if info.level == numLevels-1 { + continue + } + + if info.level == 0 { + pc = pickL0(env, p.opts, p.vers, p.baseLevel) + // Fail-safe to protect against compacting the same sstable + // concurrently. + if pc != nil && !inputRangeAlreadyCompacting(env, pc) { + p.addScoresToPickedCompactionMetrics(pc, scores) + pc.score = info.compensatedScoreRatio + // TODO(bananabrick): Create an EventListener for logCompaction. + if false { + logCompaction(pc) + } + return pc + } + continue + } + + // info.level > 0 + var ok bool + info.file, ok = pickCompactionSeedFile(p.vers, p.opts, info.level, info.outputLevel, env.earliestSnapshotSeqNum) + if !ok { + continue + } + + pc := pickAutoLPositive(env, p.opts, p.vers, *info, p.baseLevel, p.levelMaxBytes) + // Fail-safe to protect against compacting the same sstable concurrently. + if pc != nil && !inputRangeAlreadyCompacting(env, pc) { + p.addScoresToPickedCompactionMetrics(pc, scores) + pc.score = info.compensatedScoreRatio + // TODO(bananabrick): Create an EventListener for logCompaction. + if false { + logCompaction(pc) + } + return pc + } + } + + // Check for L6 files with tombstones that may be elided. These files may + // exist if a snapshot prevented the elision of a tombstone or because of + // a move compaction. These are low-priority compactions because they + // don't help us keep up with writes, just reclaim disk space. 
+ if pc := p.pickElisionOnlyCompaction(env); pc != nil { + return pc + } + + if pc := p.pickReadTriggeredCompaction(env); pc != nil { + return pc + } + + // NB: This should only be run if a read compaction wasn't + // scheduled. + // + // We won't be scheduling a read compaction right now, and in + // read heavy workloads, compactions won't be scheduled frequently + // because flushes aren't frequent. So we need to signal to the + // iterator to schedule a compaction when it adds compactions to + // the read compaction queue. + // + // We need the nil check here because without it, we have some + // tests which don't set that variable fail. Since there's a + // chance that one of those tests wouldn't want extra compactions + // to be scheduled, I added this check here, instead of + // setting rescheduleReadCompaction in those tests. + if env.readCompactionEnv.rescheduleReadCompaction != nil { + *env.readCompactionEnv.rescheduleReadCompaction = true + } + + // At the lowest possible compaction-picking priority, look for files marked + // for compaction. Pebble will mark files for compaction if they have atomic + // compaction units that span multiple files. While current Pebble code does + // not construct such sstables, RocksDB and earlier versions of Pebble may + // have created them. These split user keys form sets of files that must be + // compacted together for correctness (referred to as "atomic compaction + // units" within the code). Rewrite them in-place. + // + // It's also possible that a file may have been marked for compaction by + // even earlier versions of Pebble code, since FileMetadata's + // MarkedForCompaction field is persisted in the manifest. That's okay. We + // previously would've ignored the designation, whereas now we'll re-compact + // the file in place. 
+ if p.vers.Stats.MarkedForCompaction > 0 { + if pc := p.pickRewriteCompaction(env); pc != nil { + return pc + } + } + + return nil +} + +func (p *compactionPickerByScore) addScoresToPickedCompactionMetrics( + pc *pickedCompaction, candInfo [numLevels]candidateLevelInfo, +) { + + // candInfo is sorted by score, not by compaction level. + infoByLevel := [numLevels]candidateLevelInfo{} + for i := range candInfo { + level := candInfo[i].level + infoByLevel[level] = candInfo[i] + } + // Gather the compaction scores for the levels participating in the compaction. + pc.pickerMetrics.scores = make([]float64, len(pc.inputs)) + inputIdx := 0 + for i := range infoByLevel { + if pc.inputs[inputIdx].level == infoByLevel[i].level { + pc.pickerMetrics.scores[inputIdx] = infoByLevel[i].compensatedScoreRatio + inputIdx++ + } + if inputIdx == len(pc.inputs) { + break + } + } +} + +// elisionOnlyAnnotator implements the manifest.Annotator interface, +// annotating B-Tree nodes with the *fileMetadata of a file meeting the +// obsolete keys criteria for an elision-only compaction within the subtree. +// If multiple files meet the criteria, it chooses whichever file has the +// lowest LargestSeqNum. The lowest LargestSeqNum file will be the first +// eligible for an elision-only compaction once snapshots less than or equal +// to its LargestSeqNum are closed. +type elisionOnlyAnnotator struct{} + +var _ manifest.Annotator = elisionOnlyAnnotator{} + +func (a elisionOnlyAnnotator) Zero(interface{}) interface{} { + return nil +} + +func (a elisionOnlyAnnotator) Accumulate(f *fileMetadata, dst interface{}) (interface{}, bool) { + if f.IsCompacting() { + return dst, true + } + if !f.StatsValid() { + return dst, false + } + // Bottommost files are large and not worthwhile to compact just + // to remove a few tombstones. Consider a file ineligible if its + // own range deletions delete less than 10% of its data and its + // deletion tombstones make up less than 10% of its entries. 
+ // + // TODO(jackson): This does not account for duplicate user keys + // which may be collapsed. Ideally, we would have 'obsolete keys' + // statistics that would include tombstones, the keys that are + // dropped by tombstones and duplicated user keys. See #847. + // + // Note that tables that contain exclusively range keys (i.e. no point keys, + // `NumEntries` and `RangeDeletionsBytesEstimate` are both zero) are excluded + // from elision-only compactions. + // TODO(travers): Consider an alternative heuristic for elision of range-keys. + if f.Stats.RangeDeletionsBytesEstimate*10 < f.Size && + f.Stats.NumDeletions*10 <= f.Stats.NumEntries { + return dst, true + } + if dst == nil { + return f, true + } else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum { + return f, true + } + return dst, true +} + +func (a elisionOnlyAnnotator) Merge(v interface{}, accum interface{}) interface{} { + if v == nil { + return accum + } + // If we haven't accumulated an eligible file yet, or f's LargestSeqNum is + // less than the accumulated file's, use f. + if accum == nil { + return v + } + f := v.(*fileMetadata) + accumV := accum.(*fileMetadata) + if accumV == nil || accumV.LargestSeqNum > f.LargestSeqNum { + return f + } + return accumV +} + +// markedForCompactionAnnotator implements the manifest.Annotator interface, +// annotating B-Tree nodes with the *fileMetadata of a file that is marked for +// compaction within the subtree. If multiple files meet the criteria, it +// chooses whichever file has the lowest LargestSeqNum. +type markedForCompactionAnnotator struct{} + +var _ manifest.Annotator = markedForCompactionAnnotator{} + +func (a markedForCompactionAnnotator) Zero(interface{}) interface{} { + return nil +} + +func (a markedForCompactionAnnotator) Accumulate( + f *fileMetadata, dst interface{}, +) (interface{}, bool) { + if !f.MarkedForCompaction { + // Not marked for compaction; return dst. 
+ return dst, true + } + return markedMergeHelper(f, dst) +} + +func (a markedForCompactionAnnotator) Merge(v interface{}, accum interface{}) interface{} { + if v == nil { + return accum + } + accum, _ = markedMergeHelper(v.(*fileMetadata), accum) + return accum +} + +// REQUIRES: f is non-nil, and f.MarkedForCompaction=true. +func markedMergeHelper(f *fileMetadata, dst interface{}) (interface{}, bool) { + if dst == nil { + return f, true + } else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum { + return f, true + } + return dst, true +} + +// pickElisionOnlyCompaction looks for compactions of sstables in the +// bottommost level containing obsolete records that may now be dropped. +func (p *compactionPickerByScore) pickElisionOnlyCompaction( + env compactionEnv, +) (pc *pickedCompaction) { + if p.opts.private.disableElisionOnlyCompactions { + return nil + } + v := p.vers.Levels[numLevels-1].Annotation(elisionOnlyAnnotator{}) + if v == nil { + return nil + } + candidate := v.(*fileMetadata) + if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum { + return nil + } + lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate) + if lf == nil { + panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1)) + } + + // Construct a picked compaction of the elision candidate's atomic + // compaction unit. + pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel) + pc.kind = compactionKindElisionOnly + var isCompacting bool + pc.startLevel.files, isCompacting = expandToAtomicUnit(p.opts.Comparer.Compare, lf.Slice(), false /* disableIsCompacting */) + if isCompacting { + return nil + } + pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) + // Fail-safe to protect against compacting the same sstable concurrently. 
+ if !inputRangeAlreadyCompacting(env, pc) { + return pc + } + return nil +} + +// pickRewriteCompaction attempts to construct a compaction that +// rewrites a file marked for compaction. pickRewriteCompaction will +// pull in adjacent files in the file's atomic compaction unit if +// necessary. A rewrite compaction outputs files to the same level as +// the input level. +func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) { + for l := numLevels - 1; l >= 0; l-- { + v := p.vers.Levels[l].Annotation(markedForCompactionAnnotator{}) + if v == nil { + // Try the next level. + continue + } + candidate := v.(*fileMetadata) + if candidate.IsCompacting() { + // Try the next level. + continue + } + lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate) + if lf == nil { + panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1)) + } + + inputs := lf.Slice() + // L0 files generated by a flush have never been split such that + // adjacent files can contain the same user key. So we do not need to + // rewrite an atomic compaction unit for L0. Note that there is nothing + // preventing two different flushes from producing files that are + // non-overlapping from an InternalKey perspective, but span the same + // user key. However, such files cannot be in the same L0 sublevel, + // since each sublevel requires non-overlapping user keys (unlike other + // levels). + if l > 0 { + // Find this file's atomic compaction unit. This is only relevant + // for levels L1+. + var isCompacting bool + inputs, isCompacting = expandToAtomicUnit( + p.opts.Comparer.Compare, + inputs, + false, /* disableIsCompacting */ + ) + if isCompacting { + // Try the next level. 
+ continue + } + } + + pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel) + pc.outputLevel.level = l + pc.kind = compactionKindRewrite + pc.startLevel.files = inputs + pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) + + // Fail-safe to protect against compacting the same sstable concurrently. + if !inputRangeAlreadyCompacting(env, pc) { + if pc.startLevel.level == 0 { + pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files) + } + return pc + } + } + return nil +} + +// pickAutoLPositive picks an automatic compaction for the candidate +// file in a positive-numbered level. This function must not be used for +// L0. +func pickAutoLPositive( + env compactionEnv, + opts *Options, + vers *version, + cInfo candidateLevelInfo, + baseLevel int, + levelMaxBytes [numLevels]int64, +) (pc *pickedCompaction) { + if cInfo.level == 0 { + panic("pebble: pickAutoLPositive called for L0") + } + + pc = newPickedCompaction(opts, vers, cInfo.level, defaultOutputLevel(cInfo.level, baseLevel), baseLevel) + if pc.outputLevel.level != cInfo.outputLevel { + panic("pebble: compaction picked unexpected output level") + } + pc.startLevel.files = cInfo.file.Slice() + // Files in level 0 may overlap each other, so pick up all overlapping ones. + if pc.startLevel.level == 0 { + cmp := opts.Comparer.Compare + smallest, largest := manifest.KeyRange(cmp, pc.startLevel.files.Iter()) + pc.startLevel.files = vers.Overlaps(0, cmp, smallest.UserKey, + largest.UserKey, largest.IsExclusiveSentinel()) + if pc.startLevel.files.Empty() { + panic("pebble: empty compaction") + } + } + + if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) { + return nil + } + return pc.maybeAddLevel(opts, env.diskAvailBytes) +} + +// maybeAddLevel maybe adds a level to the picked compaction. 
+func (pc *pickedCompaction) maybeAddLevel(opts *Options, diskAvailBytes uint64) *pickedCompaction { + pc.pickerMetrics.singleLevelOverlappingRatio = pc.overlappingRatio() + if pc.outputLevel.level == numLevels-1 { + // Don't add a level if the current output level is in L6 + return pc + } + if !opts.Experimental.MultiLevelCompactionHeuristic.allowL0() && pc.startLevel.level == 0 { + return pc + } + if pc.compactionSize() > expandedCompactionByteSizeLimit( + opts, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel), diskAvailBytes) { + // Don't add a level if the current compaction exceeds the compaction size limit + return pc + } + return opts.Experimental.MultiLevelCompactionHeuristic.pick(pc, opts, diskAvailBytes) +} + +// MultiLevelHeuristic evaluates whether to add files from the next level into the compaction. +type MultiLevelHeuristic interface { + // Evaluate returns the preferred compaction. + pick(pc *pickedCompaction, opts *Options, diskAvailBytes uint64) *pickedCompaction + + // Returns if the heuristic allows L0 to be involved in ML compaction + allowL0() bool +} + +// NoMultiLevel will never add an additional level to the compaction. 
+type NoMultiLevel struct{} + +var _ MultiLevelHeuristic = (*NoMultiLevel)(nil) + +func (nml NoMultiLevel) pick( + pc *pickedCompaction, opts *Options, diskAvailBytes uint64, +) *pickedCompaction { + return pc +} + +func (nml NoMultiLevel) allowL0() bool { + return false +} + +func (pc *pickedCompaction) predictedWriteAmp() float64 { + var bytesToCompact uint64 + var higherLevelBytes uint64 + for i := range pc.inputs { + levelSize := pc.inputs[i].files.SizeSum() + bytesToCompact += levelSize + if i != len(pc.inputs)-1 { + higherLevelBytes += levelSize + } + } + return float64(bytesToCompact) / float64(higherLevelBytes) +} + +func (pc *pickedCompaction) overlappingRatio() float64 { + var higherLevelBytes uint64 + var lowestLevelBytes uint64 + for i := range pc.inputs { + levelSize := pc.inputs[i].files.SizeSum() + if i == len(pc.inputs)-1 { + lowestLevelBytes += levelSize + continue + } + higherLevelBytes += levelSize + } + return float64(lowestLevelBytes) / float64(higherLevelBytes) +} + +// WriteAmpHeuristic defines a multi level compaction heuristic which will add +// an additional level to the picked compaction if it reduces predicted write +// amp of the compaction + the addPropensity constant. +type WriteAmpHeuristic struct { + // addPropensity is a constant that affects the propensity to conduct multilevel + // compactions. If positive, a multilevel compaction may get picked even if + // the single level compaction has lower write amp, and vice versa. + AddPropensity float64 + + // AllowL0 if true, allow l0 to be involved in a ML compaction. + AllowL0 bool +} + +var _ MultiLevelHeuristic = (*WriteAmpHeuristic)(nil) + +// TODO(msbutler): microbenchmark the extent to which multilevel compaction +// picking slows down the compaction picking process. This should be as fast as +// possible since Compaction-picking holds d.mu, which prevents WAL rotations, +// in-progress flushes and compactions from completing, etc. 
Consider ways to +// deduplicate work, given that setupInputs has already been called. +func (wa WriteAmpHeuristic) pick( + pcOrig *pickedCompaction, opts *Options, diskAvailBytes uint64, +) *pickedCompaction { + pcMulti := pcOrig.clone() + if !pcMulti.setupMultiLevelCandidate(opts, diskAvailBytes) { + return pcOrig + } + picked := pcOrig + if pcMulti.predictedWriteAmp() <= pcOrig.predictedWriteAmp()+wa.AddPropensity { + picked = pcMulti + } + // Regardless of what compaction was picked, log the multilevelOverlapping ratio. + picked.pickerMetrics.multiLevelOverlappingRatio = pcMulti.overlappingRatio() + return picked +} + +func (wa WriteAmpHeuristic) allowL0() bool { + return wa.AllowL0 +} + +// Helper method to pick compactions originating from L0. Uses information about +// sublevels to generate a compaction. +func pickL0(env compactionEnv, opts *Options, vers *version, baseLevel int) (pc *pickedCompaction) { + // It is important to pass information about Lbase files to L0Sublevels + // so it can pick a compaction that does not conflict with an Lbase => Lbase+1 + // compaction. Without this, we observed reduced concurrency of L0=>Lbase + // compactions, and increasing read amplification in L0. + // + // TODO(bilal) Remove the minCompactionDepth parameter once fixing it at 1 + // has been shown to not cause a performance regression. + lcf, err := vers.L0Sublevels.PickBaseCompaction(1, vers.Levels[baseLevel].Slice()) + if err != nil { + opts.Logger.Errorf("error when picking base compaction: %s", err) + return + } + if lcf != nil { + pc = newPickedCompactionFromL0(lcf, opts, vers, baseLevel, true) + pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) + if pc.startLevel.files.Empty() { + opts.Logger.Fatalf("empty compaction chosen") + } + return pc.maybeAddLevel(opts, env.diskAvailBytes) + } + + // Couldn't choose a base compaction. Try choosing an intra-L0 + // compaction. 
Note that we pass in L0CompactionThreshold here as opposed to + // 1, since choosing a single sublevel intra-L0 compaction is + // counterproductive. + lcf, err = vers.L0Sublevels.PickIntraL0Compaction(env.earliestUnflushedSeqNum, minIntraL0Count) + if err != nil { + opts.Logger.Errorf("error when picking intra-L0 compaction: %s", err) + return + } + if lcf != nil { + pc = newPickedCompactionFromL0(lcf, opts, vers, 0, false) + if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) { + return nil + } + if pc.startLevel.files.Empty() { + opts.Logger.Fatalf("empty compaction chosen") + } + { + iter := pc.startLevel.files.Iter() + if iter.First() == nil || iter.Next() == nil { + // A single-file intra-L0 compaction is unproductive. + return nil + } + } + + pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) + } + return pc +} + +func pickManualCompaction( + vers *version, opts *Options, env compactionEnv, baseLevel int, manual *manualCompaction, +) (pc *pickedCompaction, retryLater bool) { + outputLevel := manual.level + 1 + if manual.level == 0 { + outputLevel = baseLevel + } else if manual.level < baseLevel { + // The start level for a compaction must be >= Lbase. A manual + // compaction could have been created adhering to that condition, and + // then an automatic compaction came in and compacted all of the + // sstables in Lbase to Lbase+1 which caused Lbase to change. Simply + // ignore this manual compaction as there is nothing to do (manual.level + // points to an empty level). + return nil, false + } + // This conflictsWithInProgress call is necessary for the manual compaction to + // be retried when it conflicts with an ongoing automatic compaction. Without + // it, the compaction is dropped due to pc.setupInputs returning false since + // the input/output range is already being compacted, and the manual + // compaction ends with a non-compacted LSM. 
+ if conflictsWithInProgress(manual, outputLevel, env.inProgressCompactions, opts.Comparer.Compare) { + return nil, true + } + pc = newPickedCompaction(opts, vers, manual.level, defaultOutputLevel(manual.level, baseLevel), baseLevel) + manual.outputLevel = pc.outputLevel.level + pc.startLevel.files = vers.Overlaps(manual.level, opts.Comparer.Compare, manual.start, manual.end, false) + if pc.startLevel.files.Empty() { + // Nothing to do + return nil, false + } + if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) { + // setupInputs returned false indicating there's a conflicting + // concurrent compaction. + return nil, true + } + if pc = pc.maybeAddLevel(opts, env.diskAvailBytes); pc == nil { + return nil, false + } + if pc.outputLevel.level != outputLevel { + if len(pc.extraLevels) > 0 { + // multilevel compactions relax this invariant + } else { + panic("pebble: compaction picked unexpected output level") + } + } + // Fail-safe to protect against compacting the same sstable concurrently. + if inputRangeAlreadyCompacting(env, pc) { + return nil, true + } + return pc, false +} + +func (p *compactionPickerByScore) pickReadTriggeredCompaction( + env compactionEnv, +) (pc *pickedCompaction) { + // If a flush is in-progress or expected to happen soon, it means more writes are taking place. We would + // soon be scheduling more write focussed compactions. In this case, skip read compactions as they are + // lower priority. 
+ if env.readCompactionEnv.flushing || env.readCompactionEnv.readCompactions == nil { + return nil + } + for env.readCompactionEnv.readCompactions.size > 0 { + rc := env.readCompactionEnv.readCompactions.remove() + if pc = pickReadTriggeredCompactionHelper(p, rc, env); pc != nil { + break + } + } + return pc +} + +func pickReadTriggeredCompactionHelper( + p *compactionPickerByScore, rc *readCompaction, env compactionEnv, +) (pc *pickedCompaction) { + cmp := p.opts.Comparer.Compare + overlapSlice := p.vers.Overlaps(rc.level, cmp, rc.start, rc.end, false /* exclusiveEnd */) + if overlapSlice.Empty() { + // If there is no overlap, then the file with the key range + // must have been compacted away. So, we don't proceed to + // compact the same key range again. + return nil + } + + iter := overlapSlice.Iter() + var fileMatches bool + for f := iter.First(); f != nil; f = iter.Next() { + if f.FileNum == rc.fileNum { + fileMatches = true + break + } + } + if !fileMatches { + return nil + } + + pc = newPickedCompaction(p.opts, p.vers, rc.level, defaultOutputLevel(rc.level, p.baseLevel), p.baseLevel) + + pc.startLevel.files = overlapSlice + if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) { + return nil + } + if inputRangeAlreadyCompacting(env, pc) { + return nil + } + pc.kind = compactionKindRead + + // Prevent read compactions which are too wide. + outputOverlaps := pc.version.Overlaps( + pc.outputLevel.level, pc.cmp, pc.smallest.UserKey, + pc.largest.UserKey, pc.largest.IsExclusiveSentinel()) + if outputOverlaps.SizeSum() > pc.maxReadCompactionBytes { + return nil + } + + // Prevent compactions which start with a small seed file X, but overlap + // with over allowedCompactionWidth * X file sizes in the output layer. 
+ const allowedCompactionWidth = 35 + if outputOverlaps.SizeSum() > overlapSlice.SizeSum()*allowedCompactionWidth { + return nil + } + + return pc +} + +func (p *compactionPickerByScore) forceBaseLevel1() { + p.baseLevel = 1 +} + +func inputRangeAlreadyCompacting(env compactionEnv, pc *pickedCompaction) bool { + for _, cl := range pc.inputs { + iter := cl.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.IsCompacting() { + return true + } + } + } + + // Look for active compactions outputting to the same region of the key + // space in the same output level. Two potential compactions may conflict + // without sharing input files if there are no files in the output level + // that overlap with the intersection of the compactions' key spaces. + // + // Consider an active L0->Lbase compaction compacting two L0 files one + // [a-f] and the other [t-z] into Lbase. + // + // L0 + // ↦ 000100 ↤ ↦ 000101 ↤ + // L1 + // ↦ 000004 ↤ + // a b c d e f g h i j k l m n o p q r s t u v w x y z + // + // If a new file 000102 [j-p] is flushed while the existing compaction is + // still ongoing, new file would not be in any compacting sublevel + // intervals and would not overlap with any Lbase files that are also + // compacting. However, this compaction cannot be picked because the + // compaction's output key space [j-p] would overlap the existing + // compaction's output key space [a-z]. 
+ // + // L0 + // ↦ 000100* ↤ ↦ 000102 ↤ ↦ 000101* ↤ + // L1 + // ↦ 000004* ↤ + // a b c d e f g h i j k l m n o p q r s t u v w x y z + // + // * - currently compacting + if pc.outputLevel != nil && pc.outputLevel.level != 0 { + for _, c := range env.inProgressCompactions { + if pc.outputLevel.level != c.outputLevel { + continue + } + if base.InternalCompare(pc.cmp, c.largest, pc.smallest) < 0 || + base.InternalCompare(pc.cmp, c.smallest, pc.largest) > 0 { + continue + } + + // The picked compaction and the in-progress compaction c are + // outputting to the same region of the key space of the same + // level. + return true + } + } + return false +} + +// conflictsWithInProgress checks if there are any in-progress compactions with overlapping keyspace. +func conflictsWithInProgress( + manual *manualCompaction, outputLevel int, inProgressCompactions []compactionInfo, cmp Compare, +) bool { + for _, c := range inProgressCompactions { + if (c.outputLevel == manual.level || c.outputLevel == outputLevel) && + isUserKeysOverlapping(manual.start, manual.end, c.smallest.UserKey, c.largest.UserKey, cmp) { + return true + } + for _, in := range c.inputs { + if in.files.Empty() { + continue + } + iter := in.files.Iter() + smallest := iter.First().Smallest.UserKey + largest := iter.Last().Largest.UserKey + if (in.level == manual.level || in.level == outputLevel) && + isUserKeysOverlapping(manual.start, manual.end, smallest, largest, cmp) { + return true + } + } + } + return false +} + +func isUserKeysOverlapping(x1, x2, y1, y2 []byte, cmp Compare) bool { + return cmp(x1, y2) <= 0 && cmp(y1, x2) <= 0 +} diff --git a/pebble/compaction_picker_test.go b/pebble/compaction_picker_test.go new file mode 100644 index 0000000..b0ace35 --- /dev/null +++ b/pebble/compaction_picker_test.go @@ -0,0 +1,1593 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "bytes" + "fmt" + "math" + "sort" + "strconv" + "strings" + "sync" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/humanize" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +func loadVersion(t *testing.T, d *datadriven.TestData) (*version, *Options, string) { + var sizes [numLevels]int64 + opts := &Options{} + opts.testingRandomized(t) + opts.EnsureDefaults() + + if len(d.CmdArgs) != 1 { + return nil, nil, fmt.Sprintf("%s expects 1 argument", d.Cmd) + } + var err error + opts.LBaseMaxBytes, err = strconv.ParseInt(d.CmdArgs[0].Key, 10, 64) + if err != nil { + return nil, nil, err.Error() + } + + var files [numLevels][]*fileMetadata + if len(d.Input) > 0 { + // Parse each line as + // + // : [compensation] + // + // Creating sstables within the level whose file sizes total to `size` + // and whose compensated file sizes total to `size`+`compensation`. If + // size is sufficiently large, only one single file is created. See + // the TODO below. 
+ for _, data := range strings.Split(d.Input, "\n") { + parts := strings.Split(data, " ") + parts[0] = strings.TrimSuffix(strings.TrimSpace(parts[0]), ":") + if len(parts) < 2 { + return nil, nil, fmt.Sprintf("malformed test:\n%s", d.Input) + } + level, err := strconv.Atoi(parts[0]) + if err != nil { + return nil, nil, err.Error() + } + if files[level] != nil { + return nil, nil, fmt.Sprintf("level %d already filled", level) + } + size, err := strconv.ParseUint(strings.TrimSpace(parts[1]), 10, 64) + if err != nil { + return nil, nil, err.Error() + } + var compensation uint64 + if len(parts) == 3 { + compensation, err = strconv.ParseUint(strings.TrimSpace(parts[2]), 10, 64) + if err != nil { + return nil, nil, err.Error() + } + } + + var lastFile *fileMetadata + for i := uint64(1); sizes[level] < int64(size); i++ { + var key InternalKey + if level == 0 { + // For L0, make `size` overlapping files. + key = base.MakeInternalKey([]byte(fmt.Sprintf("%04d", 1)), i, InternalKeyKindSet) + } else { + key = base.MakeInternalKey([]byte(fmt.Sprintf("%04d", i)), i, InternalKeyKindSet) + } + m := (&fileMetadata{ + FileNum: base.FileNum(uint64(level)*100_000 + i), + SmallestSeqNum: key.SeqNum(), + LargestSeqNum: key.SeqNum(), + Size: 1, + Stats: manifest.TableStats{ + RangeDeletionsBytesEstimate: 0, + }, + }).ExtendPointKeyBounds(opts.Comparer.Compare, key, key) + m.InitPhysicalBacking() + m.StatsMarkValid() + lastFile = m + if size >= 100 { + // If the requested size of the level is very large only add a single + // file in order to avoid massive blow-up in the number of files in + // the Version. + // + // TODO(peter): There is tension between the testing in + // TestCompactionPickerLevelMaxBytes and + // TestCompactionPickerTargetLevel. Clean this up somehow. 
+ m.Size = size + if level != 0 { + endKey := base.MakeInternalKey([]byte(fmt.Sprintf("%04d", size)), i, InternalKeyKindSet) + m.ExtendPointKeyBounds(opts.Comparer.Compare, key, endKey) + } + } + files[level] = append(files[level], m) + sizes[level] += int64(m.Size) + } + // Let all the compensation be due to the last file. + if lastFile != nil && compensation > 0 { + lastFile.Stats.RangeDeletionsBytesEstimate = compensation + } + } + } + + vers := newVersion(opts, files) + return vers, opts, "" +} + +func TestCompactionPickerByScoreLevelMaxBytes(t *testing.T) { + datadriven.RunTest(t, "testdata/compaction_picker_level_max_bytes", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "init": + vers, opts, errMsg := loadVersion(t, d) + if errMsg != "" { + return errMsg + } + + p, ok := newCompactionPicker(vers, opts, nil).(*compactionPickerByScore) + require.True(t, ok) + var buf bytes.Buffer + for level := p.getBaseLevel(); level < numLevels; level++ { + fmt.Fprintf(&buf, "%d: %d\n", level, p.levelMaxBytes[level]) + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestCompactionPickerTargetLevel(t *testing.T) { + var vers *version + var opts *Options + var pickerByScore *compactionPickerByScore + + parseInProgress := func(vals []string) ([]compactionInfo, error) { + var levels []int + for _, s := range vals { + l, err := strconv.ParseInt(s, 10, 8) + if err != nil { + return nil, err + } + levels = append(levels, int(l)) + } + if len(levels)%2 != 0 { + return nil, errors.New("odd number of levels with ongoing compactions") + } + var inProgress []compactionInfo + for i := 0; i < len(levels); i += 2 { + inProgress = append(inProgress, compactionInfo{ + inputs: []compactionLevel{ + {level: levels[i]}, + {level: levels[i+1]}, + }, + outputLevel: levels[i+1], + }) + } + return inProgress, nil + } + + resetCompacting := func() { + for _, files := range vers.Levels { + 
files.Slice().Each(func(f *fileMetadata) { + f.CompactionState = manifest.CompactionStateNotCompacting + }) + } + } + + datadriven.RunTest(t, "testdata/compaction_picker_target_level", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "init": + // loadVersion expects a single datadriven argument that it + // sets as Options.LBaseMaxBytes. It parses the input as + // newline-separated levels, specifying the level's file size + // and optionally additional compensation to be added during + // compensated file size calculations. Eg: + // + // init + // : [compensation] + // : [compensation] + var errMsg string + vers, opts, errMsg = loadVersion(t, d) + if errMsg != "" { + return errMsg + } + return runVersionFileSizes(vers) + case "init_cp": + resetCompacting() + + var inProgress []compactionInfo + if arg, ok := d.Arg("ongoing"); ok { + var err error + inProgress, err = parseInProgress(arg.Vals) + if err != nil { + return err.Error() + } + } + + p := newCompactionPicker(vers, opts, inProgress) + var ok bool + pickerByScore, ok = p.(*compactionPickerByScore) + require.True(t, ok) + return fmt.Sprintf("base: %d", pickerByScore.baseLevel) + case "queue": + var b strings.Builder + var inProgress []compactionInfo + for { + env := compactionEnv{ + diskAvailBytes: math.MaxUint64, + earliestUnflushedSeqNum: InternalKeySeqNumMax, + inProgressCompactions: inProgress, + } + pc := pickerByScore.pickAuto(env) + if pc == nil { + break + } + fmt.Fprintf(&b, "L%d->L%d: %.1f\n", pc.startLevel.level, pc.outputLevel.level, pc.score) + inProgress = append(inProgress, compactionInfo{ + inputs: pc.inputs, + outputLevel: pc.outputLevel.level, + smallest: pc.smallest, + largest: pc.largest, + }) + if pc.outputLevel.level == 0 { + // Once we pick one L0->L0 compaction, we'll keep on doing so + // because the test isn't marking files as Compacting. 
+ break + } + for _, cl := range pc.inputs { + cl.files.Each(func(f *fileMetadata) { + f.CompactionState = manifest.CompactionStateCompacting + fmt.Fprintf(&b, " %s marked as compacting\n", f) + }) + } + } + + resetCompacting() + return b.String() + case "pick": + resetCompacting() + + var inProgress []compactionInfo + if len(d.CmdArgs) == 1 { + arg := d.CmdArgs[0] + if arg.Key != "ongoing" { + return "unknown arg: " + arg.Key + } + var err error + inProgress, err = parseInProgress(arg.Vals) + if err != nil { + return err.Error() + } + } + + // Mark files as compacting for each in-progress compaction. + for i := range inProgress { + c := &inProgress[i] + for j, cl := range c.inputs { + iter := vers.Levels[cl.level].Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if !f.IsCompacting() { + f.CompactionState = manifest.CompactionStateCompacting + c.inputs[j].files = iter.Take().Slice() + break + } + } + } + if c.inputs[0].level == 0 && c.outputLevel != 0 { + // L0->Lbase: mark all of Lbase as compacting. 
+ c.inputs[1].files = vers.Levels[c.outputLevel].Slice() + for _, in := range c.inputs { + in.files.Each(func(f *fileMetadata) { + f.CompactionState = manifest.CompactionStateCompacting + }) + } + } + } + + var b strings.Builder + fmt.Fprintf(&b, "Initial state before pick:\n%s", runVersionFileSizes(vers)) + pc := pickerByScore.pickAuto(compactionEnv{ + earliestUnflushedSeqNum: InternalKeySeqNumMax, + inProgressCompactions: inProgress, + }) + if pc != nil { + fmt.Fprintf(&b, "Picked: L%d->L%d: %0.1f\n", pc.startLevel.level, pc.outputLevel.level, pc.score) + } + if pc == nil { + fmt.Fprintln(&b, "Picked: no compaction") + } + return b.String() + case "pick_manual": + var startLevel int + var start, end string + d.MaybeScanArgs(t, "level", &startLevel) + d.MaybeScanArgs(t, "start", &start) + d.MaybeScanArgs(t, "end", &end) + + iStart := base.MakeInternalKey([]byte(start), InternalKeySeqNumMax, InternalKeyKindMax) + iEnd := base.MakeInternalKey([]byte(end), 0, 0) + manual := &manualCompaction{ + done: make(chan error, 1), + level: startLevel, + start: iStart.UserKey, + end: iEnd.UserKey, + } + + pc, retryLater := pickManualCompaction( + pickerByScore.vers, + opts, + compactionEnv{ + earliestUnflushedSeqNum: InternalKeySeqNumMax, + }, + pickerByScore.getBaseLevel(), + manual) + if pc == nil { + return fmt.Sprintf("nil, retryLater = %v", retryLater) + } + + return fmt.Sprintf("L%d->L%d, retryLater = %v", pc.startLevel.level, pc.outputLevel.level, retryLater) + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestCompactionPickerEstimatedCompactionDebt(t *testing.T) { + datadriven.RunTest(t, "testdata/compaction_picker_estimated_debt", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "init": + vers, opts, errMsg := loadVersion(t, d) + if errMsg != "" { + return errMsg + } + opts.MemTableSize = 1000 + + p := newCompactionPicker(vers, opts, nil) + return fmt.Sprintf("%d\n", p.estimatedCompactionDebt(0)) + + 
default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestCompactionPickerL0(t *testing.T) { + opts := (*Options)(nil).EnsureDefaults() + opts.Experimental.L0CompactionConcurrency = 1 + + parseMeta := func(s string) (*fileMetadata, error) { + parts := strings.Split(s, ":") + fileNum, err := strconv.Atoi(parts[0]) + if err != nil { + return nil, err + } + fields := strings.Fields(parts[1]) + parts = strings.Split(fields[0], "-") + if len(parts) != 2 { + return nil, errors.Errorf("malformed table spec: %s", s) + } + m := (&fileMetadata{ + FileNum: base.FileNum(fileNum), + }).ExtendPointKeyBounds( + opts.Comparer.Compare, + base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + m.InitPhysicalBacking() + return m, nil + } + + var picker *compactionPickerByScore + var inProgressCompactions []compactionInfo + var pc *pickedCompaction + + datadriven.RunTest(t, "testdata/compaction_picker_L0", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + fileMetas := [manifest.NumLevels][]*fileMetadata{} + baseLevel := manifest.NumLevels - 1 + level := 0 + var err error + lines := strings.Split(td.Input, "\n") + var compactionLines []string + + for len(lines) > 0 { + data := strings.TrimSpace(lines[0]) + lines = lines[1:] + switch data { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + level, err = strconv.Atoi(data[1:]) + if err != nil { + return err.Error() + } + case "compactions": + compactionLines, lines = lines, nil + default: + meta, err := parseMeta(data) + if err != nil { + return err.Error() + } + if level != 0 && level < baseLevel { + baseLevel = level + } + fileMetas[level] = append(fileMetas[level], meta) + } + } + + // Parse in-progress compactions in the form of: + // L0 000001 -> L2 000005 + inProgressCompactions = nil + for len(compactionLines) > 0 { + parts := 
strings.Fields(compactionLines[0]) + compactionLines = compactionLines[1:] + + var level int + var info compactionInfo + first := true + compactionFiles := map[int][]*fileMetadata{} + for _, p := range parts { + switch p { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + var err error + level, err = strconv.Atoi(p[1:]) + if err != nil { + return err.Error() + } + if len(info.inputs) > 0 && info.inputs[len(info.inputs)-1].level == level { + // eg, L0 -> L0 compaction or L6 -> L6 compaction + continue + } + if info.outputLevel < level { + info.outputLevel = level + } + info.inputs = append(info.inputs, compactionLevel{level: level}) + case "->": + continue + default: + fileNum, err := strconv.Atoi(p) + if err != nil { + return err.Error() + } + var compactFile *fileMetadata + for _, m := range fileMetas[level] { + if m.FileNum == FileNum(fileNum) { + compactFile = m + } + } + if compactFile == nil { + return fmt.Sprintf("cannot find compaction file %s", FileNum(fileNum)) + } + compactFile.CompactionState = manifest.CompactionStateCompacting + if first || base.InternalCompare(DefaultComparer.Compare, info.largest, compactFile.Largest) < 0 { + info.largest = compactFile.Largest + } + if first || base.InternalCompare(DefaultComparer.Compare, info.smallest, compactFile.Smallest) > 0 { + info.smallest = compactFile.Smallest + } + first = false + compactionFiles[level] = append(compactionFiles[level], compactFile) + } + } + for i, cl := range info.inputs { + files := compactionFiles[cl.level] + info.inputs[i].files = manifest.NewLevelSliceSeqSorted(files) + // Mark as intra-L0 compacting if the compaction is + // L0 -> L0. 
+ if info.outputLevel == 0 { + for _, f := range files { + f.IsIntraL0Compacting = true + } + } + } + inProgressCompactions = append(inProgressCompactions, info) + } + + version := newVersion(opts, fileMetas) + version.L0Sublevels.InitCompactingFileInfo(inProgressL0Compactions(inProgressCompactions)) + vs := &versionSet{ + opts: opts, + cmp: DefaultComparer.Compare, + cmpName: DefaultComparer.Name, + } + vs.versions.Init(nil) + vs.append(version) + picker = &compactionPickerByScore{ + opts: opts, + vers: version, + baseLevel: baseLevel, + } + vs.picker = picker + picker.initLevelMaxBytes(inProgressCompactions) + + var buf bytes.Buffer + fmt.Fprint(&buf, version.String()) + if len(inProgressCompactions) > 0 { + fmt.Fprintln(&buf, "compactions") + for _, c := range inProgressCompactions { + fmt.Fprintf(&buf, " %s\n", c.String()) + } + } + return buf.String() + case "pick-auto": + td.MaybeScanArgs(t, "l0_compaction_threshold", &opts.L0CompactionThreshold) + td.MaybeScanArgs(t, "l0_compaction_file_threshold", &opts.L0CompactionFileThreshold) + + pc = picker.pickAuto(compactionEnv{ + diskAvailBytes: math.MaxUint64, + earliestUnflushedSeqNum: math.MaxUint64, + inProgressCompactions: inProgressCompactions, + }) + var result strings.Builder + if pc != nil { + checkClone(t, pc) + c := newCompaction(pc, opts, time.Now(), nil /* provider */) + fmt.Fprintf(&result, "L%d -> L%d\n", pc.startLevel.level, pc.outputLevel.level) + fmt.Fprintf(&result, "L%d: %s\n", pc.startLevel.level, fileNums(pc.startLevel.files)) + if !pc.outputLevel.files.Empty() { + fmt.Fprintf(&result, "L%d: %s\n", pc.outputLevel.level, fileNums(pc.outputLevel.files)) + } + if !c.grandparents.Empty() { + fmt.Fprintf(&result, "grandparents: %s\n", fileNums(c.grandparents)) + } + } else { + return "nil" + } + return result.String() + case "mark-for-compaction": + var fileNum uint64 + td.ScanArgs(t, "file", &fileNum) + for l, lm := range picker.vers.Levels { + iter := lm.Iter() + for f := iter.First(); f != nil; f 
= iter.Next() { + if f.FileNum != base.FileNum(fileNum) { + continue + } + f.MarkedForCompaction = true + picker.vers.Stats.MarkedForCompaction++ + picker.vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) + return fmt.Sprintf("marked L%d.%s", l, f.FileNum) + } + } + return "not-found" + case "max-output-file-size": + if pc == nil { + return "no compaction" + } + return fmt.Sprintf("%d", pc.maxOutputFileSize) + case "max-overlap-bytes": + if pc == nil { + return "no compaction" + } + return fmt.Sprintf("%d", pc.maxOverlapBytes) + } + return fmt.Sprintf("unrecognized command: %s", td.Cmd) + }) +} + +func TestCompactionPickerConcurrency(t *testing.T) { + opts := (*Options)(nil).EnsureDefaults() + opts.Experimental.L0CompactionConcurrency = 1 + + parseMeta := func(s string) (*fileMetadata, error) { + parts := strings.Split(s, ":") + fileNum, err := strconv.Atoi(parts[0]) + if err != nil { + return nil, err + } + fields := strings.Fields(parts[1]) + parts = strings.Split(fields[0], "-") + if len(parts) != 2 { + return nil, errors.Errorf("malformed table spec: %s", s) + } + m := (&fileMetadata{ + FileNum: base.FileNum(fileNum), + Size: 1028, + }).ExtendPointKeyBounds( + opts.Comparer.Compare, + base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + m.InitPhysicalBacking() + for _, p := range fields[1:] { + if strings.HasPrefix(p, "size=") { + v, err := strconv.Atoi(strings.TrimPrefix(p, "size=")) + if err != nil { + return nil, err + } + m.Size = uint64(v) + } + } + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + return m, nil + } + + var picker *compactionPickerByScore + var inProgressCompactions []compactionInfo + + datadriven.RunTest(t, "testdata/compaction_picker_concurrency", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + fileMetas := [manifest.NumLevels][]*fileMetadata{} + level := 0 + var err error + lines := 
strings.Split(td.Input, "\n") + var compactionLines []string + + for len(lines) > 0 { + data := strings.TrimSpace(lines[0]) + lines = lines[1:] + switch data { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + level, err = strconv.Atoi(data[1:]) + if err != nil { + return err.Error() + } + case "compactions": + compactionLines, lines = lines, nil + default: + meta, err := parseMeta(data) + if err != nil { + return err.Error() + } + fileMetas[level] = append(fileMetas[level], meta) + } + } + + // Parse in-progress compactions in the form of: + // L0 000001 -> L2 000005 + inProgressCompactions = nil + for len(compactionLines) > 0 { + parts := strings.Fields(compactionLines[0]) + compactionLines = compactionLines[1:] + + var level int + var info compactionInfo + first := true + compactionFiles := map[int][]*fileMetadata{} + for _, p := range parts { + switch p { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + var err error + level, err = strconv.Atoi(p[1:]) + if err != nil { + return err.Error() + } + if len(info.inputs) > 0 && info.inputs[len(info.inputs)-1].level == level { + // eg, L0 -> L0 compaction or L6 -> L6 compaction + continue + } + if info.outputLevel < level { + info.outputLevel = level + } + info.inputs = append(info.inputs, compactionLevel{level: level}) + case "->": + continue + default: + fileNum, err := strconv.Atoi(p) + if err != nil { + return err.Error() + } + var compactFile *fileMetadata + for _, m := range fileMetas[level] { + if m.FileNum == FileNum(fileNum) { + compactFile = m + } + } + if compactFile == nil { + return fmt.Sprintf("cannot find compaction file %s", FileNum(fileNum)) + } + compactFile.CompactionState = manifest.CompactionStateCompacting + if first || base.InternalCompare(DefaultComparer.Compare, info.largest, compactFile.Largest) < 0 { + info.largest = compactFile.Largest + } + if first || base.InternalCompare(DefaultComparer.Compare, info.smallest, compactFile.Smallest) > 0 { + info.smallest = compactFile.Smallest + } + 
first = false + compactionFiles[level] = append(compactionFiles[level], compactFile) + } + } + for i, cl := range info.inputs { + files := compactionFiles[cl.level] + if cl.level == 0 { + info.inputs[i].files = manifest.NewLevelSliceSeqSorted(files) + } else { + info.inputs[i].files = manifest.NewLevelSliceKeySorted(DefaultComparer.Compare, files) + } + // Mark as intra-L0 compacting if the compaction is + // L0 -> L0. + if info.outputLevel == 0 { + for _, f := range files { + f.IsIntraL0Compacting = true + } + } + } + inProgressCompactions = append(inProgressCompactions, info) + } + + version := newVersion(opts, fileMetas) + version.L0Sublevels.InitCompactingFileInfo(inProgressL0Compactions(inProgressCompactions)) + vs := &versionSet{ + opts: opts, + cmp: DefaultComparer.Compare, + cmpName: DefaultComparer.Name, + } + vs.versions.Init(nil) + vs.append(version) + + picker = newCompactionPicker(version, opts, inProgressCompactions).(*compactionPickerByScore) + vs.picker = picker + + var buf bytes.Buffer + fmt.Fprint(&buf, version.String()) + if len(inProgressCompactions) > 0 { + fmt.Fprintln(&buf, "compactions") + for _, c := range inProgressCompactions { + fmt.Fprintf(&buf, " %s\n", c.String()) + } + } + return buf.String() + + case "pick-auto": + td.MaybeScanArgs(t, "l0_compaction_threshold", &opts.L0CompactionThreshold) + td.MaybeScanArgs(t, "l0_compaction_concurrency", &opts.Experimental.L0CompactionConcurrency) + td.MaybeScanArgs(t, "compaction_debt_concurrency", &opts.Experimental.CompactionDebtConcurrency) + + pc := picker.pickAuto(compactionEnv{ + earliestUnflushedSeqNum: math.MaxUint64, + inProgressCompactions: inProgressCompactions, + }) + var result strings.Builder + if pc != nil { + c := newCompaction(pc, opts, time.Now(), nil /* provider */) + fmt.Fprintf(&result, "L%d -> L%d\n", pc.startLevel.level, pc.outputLevel.level) + fmt.Fprintf(&result, "L%d: %s\n", pc.startLevel.level, fileNums(pc.startLevel.files)) + if !pc.outputLevel.files.Empty() { + 
fmt.Fprintf(&result, "L%d: %s\n", pc.outputLevel.level, fileNums(pc.outputLevel.files)) + } + if !c.grandparents.Empty() { + fmt.Fprintf(&result, "grandparents: %s\n", fileNums(c.grandparents)) + } + } else { + return "nil" + } + return result.String() + } + return fmt.Sprintf("unrecognized command: %s", td.Cmd) + }) +} + +func TestCompactionPickerPickReadTriggered(t *testing.T) { + opts := (*Options)(nil).EnsureDefaults() + var picker *compactionPickerByScore + var rcList readCompactionQueue + var vers *version + + parseMeta := func(s string) (*fileMetadata, error) { + parts := strings.Split(s, ":") + fileNum, err := strconv.Atoi(parts[0]) + if err != nil { + return nil, err + } + fields := strings.Fields(parts[1]) + parts = strings.Split(fields[0], "-") + if len(parts) != 2 { + return nil, errors.Errorf("malformed table spec: %s. usage: :start.SET.1-end.SET.2", s) + } + m := (&fileMetadata{ + FileNum: base.FileNum(fileNum), + Size: 1028, + }).ExtendPointKeyBounds( + opts.Comparer.Compare, + base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + m.InitPhysicalBacking() + for _, p := range fields[1:] { + if strings.HasPrefix(p, "size=") { + v, err := strconv.Atoi(strings.TrimPrefix(p, "size=")) + if err != nil { + return nil, err + } + m.Size = uint64(v) + } + } + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + return m, nil + } + + datadriven.RunTest(t, "testdata/compaction_picker_read_triggered", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + rcList = readCompactionQueue{} + fileMetas := [manifest.NumLevels][]*fileMetadata{} + level := 0 + var err error + lines := strings.Split(td.Input, "\n") + + for len(lines) > 0 { + data := strings.TrimSpace(lines[0]) + lines = lines[1:] + switch data { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + level, err = strconv.Atoi(data[1:]) + if err != nil { + return err.Error() + } + default: + 
meta, err := parseMeta(data) + if err != nil { + return err.Error() + } + fileMetas[level] = append(fileMetas[level], meta) + } + } + + vers = newVersion(opts, fileMetas) + vs := &versionSet{ + opts: opts, + cmp: DefaultComparer.Compare, + cmpName: DefaultComparer.Name, + } + vs.versions.Init(nil) + vs.append(vers) + var inProgressCompactions []compactionInfo + picker = newCompactionPicker(vers, opts, inProgressCompactions).(*compactionPickerByScore) + vs.picker = picker + + var buf bytes.Buffer + fmt.Fprint(&buf, vers.String()) + return buf.String() + + case "add-read-compaction": + for _, line := range strings.Split(td.Input, "\n") { + if line == "" { + continue + } + parts := strings.Split(line, " ") + if len(parts) != 3 { + return "error: malformed data for add-read-compaction. usage: : - " + } + if l, err := strconv.Atoi(parts[0][:1]); err == nil { + keys := strings.Split(parts[1], "-") + fileNum, _ := strconv.Atoi(parts[2]) + + rc := readCompaction{ + level: l, + start: []byte(keys[0]), + end: []byte(keys[1]), + fileNum: base.FileNum(fileNum), + } + rcList.add(&rc, DefaultComparer.Compare) + } else { + return err.Error() + } + } + return "" + + case "show-read-compactions": + var sb strings.Builder + if rcList.size == 0 { + sb.WriteString("(none)") + } + for i := 0; i < rcList.size; i++ { + rc := rcList.at(i) + sb.WriteString(fmt.Sprintf("(level: %d, start: %s, end: %s)\n", rc.level, string(rc.start), string(rc.end))) + } + return sb.String() + + case "pick-auto": + pc := picker.pickAuto(compactionEnv{ + earliestUnflushedSeqNum: math.MaxUint64, + readCompactionEnv: readCompactionEnv{ + readCompactions: &rcList, + flushing: false, + }, + }) + var result strings.Builder + if pc != nil { + fmt.Fprintf(&result, "L%d -> L%d\n", pc.startLevel.level, pc.outputLevel.level) + fmt.Fprintf(&result, "L%d: %s\n", pc.startLevel.level, fileNums(pc.startLevel.files)) + if !pc.outputLevel.files.Empty() { + fmt.Fprintf(&result, "L%d: %s\n", pc.outputLevel.level, 
fileNums(pc.outputLevel.files)) + } + } else { + return "nil" + } + return result.String() + } + return fmt.Sprintf("unrecognized command: %s", td.Cmd) + }) +} + +type alwaysMultiLevel struct{} + +func (d alwaysMultiLevel) pick( + pcOrig *pickedCompaction, opts *Options, diskAvailBytes uint64, +) *pickedCompaction { + pcMulti := pcOrig.clone() + if !pcMulti.setupMultiLevelCandidate(opts, diskAvailBytes) { + return pcOrig + } + return pcMulti +} + +func (d alwaysMultiLevel) allowL0() bool { + return false +} + +func TestPickedCompactionSetupInputs(t *testing.T) { + opts := &Options{} + opts.EnsureDefaults() + + parseMeta := func(s string) *fileMetadata { + parts := strings.Split(strings.TrimSpace(s), " ") + var fileSize uint64 + var compacting bool + for _, part := range parts { + switch { + case part == "compacting": + compacting = true + case strings.HasPrefix(part, "size="): + v, err := strconv.ParseUint(strings.TrimPrefix(part, "size="), 10, 64) + require.NoError(t, err) + fileSize = v + } + } + tableParts := strings.Split(parts[0], "-") + if len(tableParts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + state := manifest.CompactionStateNotCompacting + if compacting { + state = manifest.CompactionStateCompacting + } + m := (&fileMetadata{ + CompactionState: state, + Size: fileSize, + }).ExtendPointKeyBounds( + opts.Comparer.Compare, + base.ParseInternalKey(strings.TrimSpace(tableParts[0])), + base.ParseInternalKey(strings.TrimSpace(tableParts[1])), + ) + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + m.InitPhysicalBacking() + return m + } + + setupInputTest := func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "setup-inputs": + var availBytes uint64 = math.MaxUint64 + var maxLevelBytes [7]int64 + args := d.CmdArgs + + if len(args) > 0 && args[0].Key == "avail-bytes" { + require.Equal(t, 1, len(args[0].Vals)) + var err error + availBytes, err = strconv.ParseUint(args[0].Vals[0], 10, 64) + 
require.NoError(t, err) + args = args[1:] + } + + if len(args) != 2 { + return "setup-inputs [avail-bytes=XXX] " + } + + pc := &pickedCompaction{ + cmp: DefaultComparer.Compare, + inputs: []compactionLevel{{level: -1}, {level: -1}}, + } + pc.startLevel, pc.outputLevel = &pc.inputs[0], &pc.inputs[1] + var currentLevel int + var files [numLevels][]*fileMetadata + fileNum := FileNum(1) + + for _, data := range strings.Split(d.Input, "\n") { + switch data[:2] { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + levelArgs := strings.Fields(data) + level, err := strconv.Atoi(levelArgs[0][1:]) + if err != nil { + return err.Error() + } + currentLevel = level + if len(levelArgs) > 1 { + maxSizeArg := strings.Replace(levelArgs[1], "max-size=", "", 1) + maxSize, err := strconv.ParseInt(maxSizeArg, 10, 64) + if err != nil { + return err.Error() + } + maxLevelBytes[level] = maxSize + } else { + maxLevelBytes[level] = math.MaxInt64 + } + if pc.startLevel.level == -1 { + pc.startLevel.level = level + + } else if pc.outputLevel.level == -1 { + if pc.startLevel.level >= level { + return fmt.Sprintf("startLevel=%d >= outputLevel=%d\n", pc.startLevel.level, level) + } + pc.outputLevel.level = level + } + default: + meta := parseMeta(data) + meta.FileNum = fileNum + fileNum++ + files[currentLevel] = append(files[currentLevel], meta) + } + } + + if pc.outputLevel.level == -1 { + pc.outputLevel.level = pc.startLevel.level + 1 + } + pc.version = newVersion(opts, files) + pc.startLevel.files = pc.version.Overlaps(pc.startLevel.level, pc.cmp, + []byte(args[0].String()), []byte(args[1].String()), false /* exclusiveEnd */) + + var isCompacting bool + if !pc.setupInputs(opts, availBytes, pc.startLevel) { + isCompacting = true + } + origPC := pc + pc = pc.maybeAddLevel(opts, availBytes) + // If pc points to a new pickedCompaction, a new multi level compaction + // was initialized. 
+ initMultiLevel := pc != origPC + checkClone(t, pc) + var buf bytes.Buffer + for _, cl := range pc.inputs { + if cl.files.Empty() { + continue + } + + fmt.Fprintf(&buf, "L%d\n", cl.level) + cl.files.Each(func(f *fileMetadata) { + fmt.Fprintf(&buf, " %s\n", f) + }) + } + if isCompacting { + fmt.Fprintf(&buf, "is-compacting\n") + } + + if initMultiLevel { + extraLevel := pc.extraLevels[0].level + fmt.Fprintf(&buf, "init-multi-level(%d,%d,%d)\n", pc.startLevel.level, extraLevel, + pc.outputLevel.level) + fmt.Fprintf(&buf, "Original WriteAmp %.2f; ML WriteAmp %.2f\n", origPC.predictedWriteAmp(), pc.predictedWriteAmp()) + fmt.Fprintf(&buf, "Original OverlappingRatio %.2f; ML OverlappingRatio %.2f\n", origPC.overlappingRatio(), pc.overlappingRatio()) + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + } + + t.Logf("Test basic setup inputs behavior without multi level compactions") + opts.Experimental.MultiLevelCompactionHeuristic = NoMultiLevel{} + datadriven.RunTest(t, "testdata/compaction_setup_inputs", + setupInputTest) + + t.Logf("Turning multi level compaction on") + opts.Experimental.MultiLevelCompactionHeuristic = alwaysMultiLevel{} + datadriven.RunTest(t, "testdata/compaction_setup_inputs_multilevel_dummy", + setupInputTest) + + t.Logf("Try Write-Amp Heuristic") + opts.Experimental.MultiLevelCompactionHeuristic = WriteAmpHeuristic{} + datadriven.RunTest(t, "testdata/compaction_setup_inputs_multilevel_write_amp", + setupInputTest) +} + +func TestPickedCompactionExpandInputs(t *testing.T) { + opts := &Options{} + opts.EnsureDefaults() + cmp := DefaultComparer.Compare + var files []*fileMetadata + + parseMeta := func(s string) *fileMetadata { + parts := strings.Split(s, "-") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + m := (&fileMetadata{}).ExtendPointKeyBounds( + opts.Comparer.Compare, + base.ParseInternalKey(parts[0]), + base.ParseInternalKey(parts[1]), + ) + m.InitPhysicalBacking() + return m 
+ } + + datadriven.RunTest(t, "testdata/compaction_expand_inputs", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + files = nil + if len(d.Input) == 0 { + return "" + } + for _, data := range strings.Split(d.Input, "\n") { + meta := parseMeta(data) + meta.FileNum = FileNum(len(files)) + files = append(files, meta) + } + manifest.SortBySmallest(files, cmp) + return "" + + case "expand-inputs": + pc := &pickedCompaction{ + cmp: cmp, + inputs: []compactionLevel{{level: 1}}, + } + pc.startLevel = &pc.inputs[0] + + var filesLevelled [numLevels][]*fileMetadata + filesLevelled[pc.startLevel.level] = files + pc.version = newVersion(opts, filesLevelled) + + if len(d.CmdArgs) != 1 { + return fmt.Sprintf("%s expects 1 argument", d.Cmd) + } + index, err := strconv.ParseInt(d.CmdArgs[0].String(), 10, 64) + if err != nil { + return err.Error() + } + + // Advance the iterator to position `index`. + iter := pc.version.Levels[pc.startLevel.level].Iter() + _ = iter.First() + for i := int64(0); i < index; i++ { + _ = iter.Next() + } + + inputs, _ := expandToAtomicUnit(cmp, iter.Take().Slice(), true /* disableIsCompacting */) + + var buf bytes.Buffer + inputs.Each(func(f *fileMetadata) { + fmt.Fprintf(&buf, "%d: %s-%s\n", f.FileNum, f.Smallest, f.Largest) + }) + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestCompactionOutputFileSize(t *testing.T) { + opts := (*Options)(nil).EnsureDefaults() + var picker *compactionPickerByScore + var vers *version + + parseMeta := func(s string) (*fileMetadata, error) { + parts := strings.Split(s, ":") + fileNum, err := strconv.Atoi(parts[0]) + if err != nil { + return nil, err + } + fields := strings.Fields(parts[1]) + parts = strings.Split(fields[0], "-") + if len(parts) != 2 { + return nil, errors.Errorf("malformed table spec: %s. 
usage: :start.SET.1-end.SET.2", s) + } + m := (&fileMetadata{ + FileNum: base.FileNum(fileNum), + Size: 1028, + }).ExtendPointKeyBounds( + opts.Comparer.Compare, + base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + m.InitPhysicalBacking() + for _, p := range fields[1:] { + if strings.HasPrefix(p, "size=") { + v, err := strconv.Atoi(strings.TrimPrefix(p, "size=")) + if err != nil { + return nil, err + } + m.Size = uint64(v) + } + if strings.HasPrefix(p, "range-deletions-bytes-estimate=") { + v, err := strconv.Atoi(strings.TrimPrefix(p, "range-deletions-bytes-estimate=")) + if err != nil { + return nil, err + } + m.Stats.RangeDeletionsBytesEstimate = uint64(v) + m.Stats.NumDeletions = 1 // At least one range del responsible for the deletion bytes. + m.StatsMarkValid() + } + } + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + return m, nil + } + + datadriven.RunTest(t, "testdata/compaction_output_file_size", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + fileMetas := [manifest.NumLevels][]*fileMetadata{} + level := 0 + var err error + lines := strings.Split(td.Input, "\n") + + for len(lines) > 0 { + data := strings.TrimSpace(lines[0]) + lines = lines[1:] + switch data { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + level, err = strconv.Atoi(data[1:]) + if err != nil { + return err.Error() + } + default: + meta, err := parseMeta(data) + if err != nil { + return err.Error() + } + fileMetas[level] = append(fileMetas[level], meta) + } + } + + vers = newVersion(opts, fileMetas) + vs := &versionSet{ + opts: opts, + cmp: DefaultComparer.Compare, + cmpName: DefaultComparer.Name, + } + vs.versions.Init(nil) + vs.append(vers) + var inProgressCompactions []compactionInfo + picker = newCompactionPicker(vers, opts, inProgressCompactions).(*compactionPickerByScore) + vs.picker = picker + + var buf bytes.Buffer + fmt.Fprint(&buf, vers.String()) 
+ return buf.String() + + case "pick-auto": + pc := picker.pickAuto(compactionEnv{ + earliestUnflushedSeqNum: math.MaxUint64, + earliestSnapshotSeqNum: math.MaxUint64, + }) + var buf bytes.Buffer + if pc != nil { + fmt.Fprintf(&buf, "L%d -> L%d\n", pc.startLevel.level, pc.outputLevel.level) + fmt.Fprintf(&buf, "L%d: %s\n", pc.startLevel.level, fileNums(pc.startLevel.files)) + fmt.Fprintf(&buf, "maxOutputFileSize: %d\n", pc.maxOutputFileSize) + } else { + return "nil" + } + return buf.String() + + default: + return fmt.Sprintf("unrecognized command: %s", td.Cmd) + } + }) +} + +func TestCompactionPickerCompensatedSize(t *testing.T) { + testCases := []struct { + size uint64 + pointDelEstimateBytes uint64 + rangeDelEstimateBytes uint64 + wantBytes uint64 + }{ + { + size: 100, + pointDelEstimateBytes: 0, + rangeDelEstimateBytes: 0, + wantBytes: 100, + }, + { + size: 100, + pointDelEstimateBytes: 10, + rangeDelEstimateBytes: 0, + wantBytes: 100 + 10, + }, + { + size: 100, + pointDelEstimateBytes: 10, + rangeDelEstimateBytes: 5, + wantBytes: 100 + 10 + 5, + }, + } + + for _, tc := range testCases { + t.Run("", func(t *testing.T) { + f := &fileMetadata{Size: tc.size} + f.InitPhysicalBacking() + f.Stats.PointDeletionsBytesEstimate = tc.pointDelEstimateBytes + f.Stats.RangeDeletionsBytesEstimate = tc.rangeDelEstimateBytes + gotBytes := compensatedSize(f) + require.Equal(t, tc.wantBytes, gotBytes) + }) + } +} + +func TestCompactionPickerPickFile(t *testing.T) { + fs := vfs.NewMem() + opts := &Options{ + Comparer: testkeys.Comparer, + FormatMajorVersion: FormatNewest, + FS: fs, + } + + d, err := Open("", opts) + require.NoError(t, err) + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + datadriven.RunTest(t, "testdata/compaction_picker_pick_file", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + require.NoError(t, d.Close()) + + d, err = runDBDefineCmd(td, opts) + if err != nil { + return err.Error() + } + 
d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "file-sizes": + return runTableFileSizesCmd(td, d) + + case "pick-file": + s := strings.TrimPrefix(td.CmdArgs[0].String(), "L") + level, err := strconv.Atoi(s) + if err != nil { + return fmt.Sprintf("unable to parse arg %q as level", td.CmdArgs[0].String()) + } + if level == 0 { + panic("L0 picking unimplemented") + } + d.mu.Lock() + defer d.mu.Unlock() + + // Use maybeScheduleCompactionPicker to take care of all of the + // initialization of the compaction-picking environment, but never + // pick a compaction; just call pickFile using the user-provided + // level. + var lf manifest.LevelFile + var ok bool + d.maybeScheduleCompactionPicker(func(untypedPicker compactionPicker, env compactionEnv) *pickedCompaction { + p := untypedPicker.(*compactionPickerByScore) + lf, ok = pickCompactionSeedFile(p.vers, opts, level, level+1, env.earliestSnapshotSeqNum) + return nil + }) + if !ok { + return "(none)" + } + return lf.FileMetadata.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +type pausableCleaner struct { + mu sync.Mutex + cond sync.Cond + paused bool + cleaner Cleaner +} + +func (c *pausableCleaner) Clean(fs vfs.FS, fileType base.FileType, path string) error { + c.mu.Lock() + defer c.mu.Unlock() + for c.paused { + c.cond.Wait() + } + return c.cleaner.Clean(fs, fileType, path) +} + +func (c *pausableCleaner) pause() { + c.mu.Lock() + defer c.mu.Unlock() + c.paused = true +} + +func (c *pausableCleaner) resume() { + c.mu.Lock() + defer c.mu.Unlock() + c.paused = false + c.cond.Broadcast() +} + +func TestCompactionPickerScores(t *testing.T) { + fs := vfs.NewMem() + cleaner := pausableCleaner{cleaner: DeleteCleaner{}} + cleaner.cond.L = &cleaner.mu + opts := &Options{ + Cleaner: &cleaner, + Comparer: testkeys.Comparer, + DisableAutomaticCompactions: true, + FormatMajorVersion: FormatNewest, + FS: fs, + } + + d, err := Open("", opts) + 
require.NoError(t, err) + defer func() { + if d != nil { + cleaner.resume() + require.NoError(t, closeAllSnapshots(d)) + require.NoError(t, d.Close()) + } + }() + + var buf bytes.Buffer + datadriven.RunTest(t, "testdata/compaction_picker_scores", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + require.NoError(t, closeAllSnapshots(d)) + require.NoError(t, d.Close()) + + if td.HasArg("pause-cleaning") { + cleaner.pause() + } + + d, err = runDBDefineCmd(td, opts) + if err != nil { + return err.Error() + } + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "disable-table-stats": + d.mu.Lock() + d.opts.private.disableTableStats = true + d.mu.Unlock() + return "" + + case "enable-table-stats": + d.mu.Lock() + d.opts.private.disableTableStats = false + d.maybeCollectTableStatsLocked() + d.mu.Unlock() + return "" + + case "resume-cleaning": + cleaner.resume() + return "" + + case "ingest": + if err = runBuildCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + if err = runIngestCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "lsm": + return runLSMCmd(td, d) + + case "maybe-compact": + buf.Reset() + d.mu.Lock() + d.opts.DisableAutomaticCompactions = false + d.maybeScheduleCompaction() + fmt.Fprintf(&buf, "%d compactions in progress:", d.mu.compact.compactingCount) + for c := range d.mu.compact.inProgress { + fmt.Fprintf(&buf, "\n%s", c) + } + d.opts.DisableAutomaticCompactions = true + d.mu.Unlock() + return buf.String() + + case "scores": + waitFor := "completion" + td.MaybeScanArgs(t, "wait-for-compaction", &waitFor) + + // Wait for any running compactions to complete before calculating + // scores. Otherwise, the output of this command is + // nondeterministic. 
+ switch waitFor { + case "completion": + d.mu.Lock() + for d.mu.compact.compactingCount > 0 { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + case "version-edit": + func() { + for { + d.mu.Lock() + wait := len(d.mu.compact.inProgress) > 0 + for c := range d.mu.compact.inProgress { + wait = wait && !c.versionEditApplied + } + d.mu.Unlock() + if !wait { + return + } + // d.mu.compact.cond isn't notified until the compaction + // is removed from inProgress, so we need to just sleep + // and check again soon. + time.Sleep(10 * time.Millisecond) + } + }() + default: + panic(fmt.Sprintf("unrecognized `wait-for-compaction` value: %q", waitFor)) + } + + buf.Reset() + fmt.Fprintf(&buf, "L Size Score\n") + for l, lm := range d.Metrics().Levels { + if l < numLevels-1 { + fmt.Fprintf(&buf, "L%-3d\t%-7s%.1f\n", l, humanize.Bytes.Int64(lm.Size), lm.Score) + } else { + fmt.Fprintf(&buf, "L%-3d\t%-7s-\n", l, humanize.Bytes.Int64(lm.Size)) + } + } + return buf.String() + + case "wait-pending-table-stats": + return runTableStatsCmd(td, d) + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func fileNums(files manifest.LevelSlice) string { + var ss []string + files.Each(func(f *fileMetadata) { + ss = append(ss, f.FileNum.String()) + }) + sort.Strings(ss) + return strings.Join(ss, ",") +} + +func checkClone(t *testing.T, pc *pickedCompaction) { + pcClone := pc.clone() + require.Equal(t, pc.String(), pcClone.String()) + + // ensure all input files are in new address + for i := range pc.inputs { + // Len could be zero if setup inputs rejected a level + if pc.inputs[i].files.Len() > 0 { + require.NotEqual(t, &pc.inputs[i], &pcClone.inputs[i]) + } + } + for i := range pc.startLevel.l0SublevelInfo { + if pc.startLevel.l0SublevelInfo[i].Len() > 0 { + require.NotEqual(t, &pc.startLevel.l0SublevelInfo[i], &pcClone.startLevel.l0SublevelInfo[i]) + } + } +} diff --git a/pebble/compaction_test.go b/pebble/compaction_test.go new file mode 100644 index 0000000..ea1437a 
--- /dev/null +++ b/pebble/compaction_test.go @@ -0,0 +1,3912 @@ +// Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "context" + crand "crypto/rand" + "fmt" + "math" + "math/rand" + "path/filepath" + "reflect" + "regexp" + "runtime" + "slices" + "sort" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/objstorage" + "github.com/cockroachdb/pebble/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/vfs/errorfs" + "github.com/stretchr/testify/require" +) + +func newVersion(opts *Options, files [numLevels][]*fileMetadata) *version { + return manifest.NewVersion( + opts.Comparer.Compare, + opts.Comparer.FormatKey, + opts.FlushSplitBytes, + files) +} + +type compactionPickerForTesting struct { + score float64 + level int + baseLevel int + opts *Options + vers *manifest.Version + maxLevelBytes [7]int64 +} + +var _ compactionPicker = &compactionPickerForTesting{} + +func (p *compactionPickerForTesting) getScores([]compactionInfo) [numLevels]float64 { + return [numLevels]float64{} +} + +func (p *compactionPickerForTesting) getBaseLevel() int { + return p.baseLevel +} + +func (p *compactionPickerForTesting) estimatedCompactionDebt(l0ExtraSize uint64) uint64 { + return 0 +} + +func (p *compactionPickerForTesting) forceBaseLevel1() {} + +func (p *compactionPickerForTesting) pickAuto(env compactionEnv) (pc 
*pickedCompaction) { + if p.score < 1 { + return nil + } + outputLevel := p.level + 1 + if p.level == 0 { + outputLevel = p.baseLevel + } + iter := p.vers.Levels[p.level].Iter() + iter.First() + cInfo := candidateLevelInfo{ + level: p.level, + outputLevel: outputLevel, + file: iter.Take(), + } + if cInfo.level == 0 { + return pickL0(env, p.opts, p.vers, p.baseLevel) + } + return pickAutoLPositive(env, p.opts, p.vers, cInfo, p.baseLevel, p.maxLevelBytes) +} + +func (p *compactionPickerForTesting) pickElisionOnlyCompaction( + env compactionEnv, +) (pc *pickedCompaction) { + return nil +} + +func (p *compactionPickerForTesting) pickRewriteCompaction( + env compactionEnv, +) (pc *pickedCompaction) { + return nil +} + +func (p *compactionPickerForTesting) pickReadTriggeredCompaction( + env compactionEnv, +) (pc *pickedCompaction) { + return nil +} + +func TestPickCompaction(t *testing.T) { + fileNums := func(files manifest.LevelSlice) string { + var ss []string + files.Each(func(meta *fileMetadata) { + ss = append(ss, strconv.Itoa(int(meta.FileNum))) + }) + sort.Strings(ss) + return strings.Join(ss, ",") + } + + opts := (*Options)(nil).EnsureDefaults() + newFileMeta := func(fileNum FileNum, size uint64, smallest, largest base.InternalKey) *fileMetadata { + m := (&fileMetadata{ + FileNum: fileNum, + Size: size, + }).ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest) + m.InitPhysicalBacking() + return m + } + + testCases := []struct { + desc string + version *version + picker compactionPickerForTesting + want string + wantMulti bool + }{ + { + desc: "no compaction", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 0: { + newFileMeta( + 100, + 1, + base.ParseInternalKey("i.SET.101"), + base.ParseInternalKey("j.SET.102"), + ), + }, + }), + want: "", + }, + + { + desc: "1 L0 file", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 0: { + newFileMeta( + 100, + 1, + base.ParseInternalKey("i.SET.101"), + base.ParseInternalKey("j.SET.102"), + ), + }, 
+ }), + picker: compactionPickerForTesting{ + score: 99, + level: 0, + baseLevel: 1, + }, + want: "100 ", + }, + + { + desc: "2 L0 files (0 overlaps)", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 0: { + newFileMeta( + 100, + 1, + base.ParseInternalKey("i.SET.101"), + base.ParseInternalKey("j.SET.102"), + ), + newFileMeta( + 110, + 1, + base.ParseInternalKey("k.SET.111"), + base.ParseInternalKey("l.SET.112"), + ), + }, + }), + picker: compactionPickerForTesting{ + score: 99, + level: 0, + baseLevel: 1, + }, + want: "100,110 ", + }, + + { + desc: "2 L0 files, with ikey overlap", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 0: { + newFileMeta( + 100, + 1, + base.ParseInternalKey("i.SET.101"), + base.ParseInternalKey("p.SET.102"), + ), + newFileMeta( + 110, + 1, + base.ParseInternalKey("j.SET.111"), + base.ParseInternalKey("q.SET.112"), + ), + }, + }), + picker: compactionPickerForTesting{ + score: 99, + level: 0, + baseLevel: 1, + }, + want: "100,110 ", + }, + + { + desc: "2 L0 files, with ukey overlap", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 0: { + newFileMeta( + 100, + 1, + base.ParseInternalKey("i.SET.101"), + base.ParseInternalKey("i.SET.102"), + ), + newFileMeta( + 110, + 1, + base.ParseInternalKey("i.SET.111"), + base.ParseInternalKey("i.SET.112"), + ), + }, + }), + picker: compactionPickerForTesting{ + score: 99, + level: 0, + baseLevel: 1, + }, + want: "100,110 ", + }, + + { + desc: "1 L0 file, 2 L1 files (0 overlaps)", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 0: { + newFileMeta( + 100, + 1, + base.ParseInternalKey("i.SET.101"), + base.ParseInternalKey("i.SET.102"), + ), + }, + 1: { + newFileMeta( + 200, + 1, + base.ParseInternalKey("a.SET.201"), + base.ParseInternalKey("b.SET.202"), + ), + newFileMeta( + 210, + 1, + base.ParseInternalKey("y.SET.211"), + base.ParseInternalKey("z.SET.212"), + ), + }, + }), + picker: compactionPickerForTesting{ + score: 99, + level: 0, + baseLevel: 1, + }, + want: 
"100 ", + }, + + { + desc: "1 L0 file, 2 L1 files (1 overlap), 4 L2 files (3 overlaps)", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 0: { + newFileMeta( + 100, + 1, + base.ParseInternalKey("i.SET.101"), + base.ParseInternalKey("t.SET.102"), + ), + }, + 1: { + newFileMeta( + 200, + 1, + base.ParseInternalKey("a.SET.201"), + base.ParseInternalKey("e.SET.202"), + ), + newFileMeta( + 210, + 1, + base.ParseInternalKey("f.SET.211"), + base.ParseInternalKey("j.SET.212"), + ), + }, + 2: { + newFileMeta( + 300, + 1, + base.ParseInternalKey("a.SET.301"), + base.ParseInternalKey("b.SET.302"), + ), + newFileMeta( + 310, + 1, + base.ParseInternalKey("c.SET.311"), + base.ParseInternalKey("g.SET.312"), + ), + newFileMeta( + 320, + 1, + base.ParseInternalKey("h.SET.321"), + base.ParseInternalKey("m.SET.322"), + ), + newFileMeta( + 330, + 1, + base.ParseInternalKey("n.SET.331"), + base.ParseInternalKey("z.SET.332"), + ), + }, + }), + picker: compactionPickerForTesting{ + score: 99, + level: 0, + baseLevel: 1, + }, + want: "100 210 310,320,330", + }, + + { + desc: "4 L1 files, 2 L2 files, can grow", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 1: { + newFileMeta( + 200, + 1, + base.ParseInternalKey("i1.SET.201"), + base.ParseInternalKey("i2.SET.202"), + ), + newFileMeta( + 210, + 1, + base.ParseInternalKey("j1.SET.211"), + base.ParseInternalKey("j2.SET.212"), + ), + newFileMeta( + 220, + 1, + base.ParseInternalKey("k1.SET.221"), + base.ParseInternalKey("k2.SET.222"), + ), + newFileMeta( + 230, + 1, + base.ParseInternalKey("l1.SET.231"), + base.ParseInternalKey("l2.SET.232"), + ), + }, + 2: { + newFileMeta( + 300, + 1, + base.ParseInternalKey("a0.SET.301"), + base.ParseInternalKey("l0.SET.302"), + ), + newFileMeta( + 310, + 1, + base.ParseInternalKey("l2.SET.311"), + base.ParseInternalKey("z2.SET.312"), + ), + }, + }), + picker: compactionPickerForTesting{ + score: 99, + level: 1, + baseLevel: 1, + }, + want: "200,210,220 300 ", + wantMulti: true, + }, + + 
{ + desc: "4 L1 files, 2 L2 files, can't grow (range)", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 1: { + newFileMeta( + 200, + 1, + base.ParseInternalKey("i1.SET.201"), + base.ParseInternalKey("i2.SET.202"), + ), + newFileMeta( + 210, + 1, + base.ParseInternalKey("j1.SET.211"), + base.ParseInternalKey("j2.SET.212"), + ), + newFileMeta( + 220, + 1, + base.ParseInternalKey("k1.SET.221"), + base.ParseInternalKey("k2.SET.222"), + ), + newFileMeta( + 230, + 1, + base.ParseInternalKey("l1.SET.231"), + base.ParseInternalKey("l2.SET.232"), + ), + }, + 2: { + newFileMeta( + 300, + 1, + base.ParseInternalKey("a0.SET.301"), + base.ParseInternalKey("j0.SET.302"), + ), + newFileMeta( + 310, + 1, + base.ParseInternalKey("j2.SET.311"), + base.ParseInternalKey("z2.SET.312"), + ), + }, + }), + picker: compactionPickerForTesting{ + score: 99, + level: 1, + baseLevel: 1, + }, + want: "200 300 ", + wantMulti: true, + }, + + { + desc: "4 L1 files, 2 L2 files, can't grow (size)", + version: newVersion(opts, [numLevels][]*fileMetadata{ + 1: { + newFileMeta( + 200, + expandedCompactionByteSizeLimit(opts, 1, math.MaxUint64)-1, + base.ParseInternalKey("i1.SET.201"), + base.ParseInternalKey("i2.SET.202"), + ), + newFileMeta( + 210, + expandedCompactionByteSizeLimit(opts, 1, math.MaxUint64)-1, + base.ParseInternalKey("j1.SET.211"), + base.ParseInternalKey("j2.SET.212"), + ), + newFileMeta( + 220, + expandedCompactionByteSizeLimit(opts, 1, math.MaxUint64)-1, + base.ParseInternalKey("k1.SET.221"), + base.ParseInternalKey("k2.SET.222"), + ), + newFileMeta( + 230, + expandedCompactionByteSizeLimit(opts, 1, math.MaxUint64)-1, + base.ParseInternalKey("l1.SET.231"), + base.ParseInternalKey("l2.SET.232"), + ), + }, + 2: { + newFileMeta( + 300, + expandedCompactionByteSizeLimit(opts, 2, math.MaxUint64)-1, + base.ParseInternalKey("a0.SET.301"), + base.ParseInternalKey("l0.SET.302"), + ), + newFileMeta( + 310, + expandedCompactionByteSizeLimit(opts, 2, math.MaxUint64)-1, + 
base.ParseInternalKey("l2.SET.311"), + base.ParseInternalKey("z2.SET.312"), + ), + }, + }), + picker: compactionPickerForTesting{ + score: 99, + level: 1, + baseLevel: 1, + }, + want: "200 300 ", + }, + } + + for _, tc := range testCases { + vs := &versionSet{ + opts: opts, + cmp: DefaultComparer.Compare, + cmpName: DefaultComparer.Name, + } + vs.versions.Init(nil) + vs.append(tc.version) + tc.picker.opts = opts + tc.picker.vers = tc.version + vs.picker = &tc.picker + pc, got := vs.picker.pickAuto(compactionEnv{diskAvailBytes: math.MaxUint64}), "" + if pc != nil { + c := newCompaction(pc, opts, time.Now(), nil /* provider */) + + gotStart := fileNums(c.startLevel.files) + gotML := "" + observedMulti := len(c.extraLevels) > 0 + if observedMulti { + gotML = " " + fileNums(c.extraLevels[0].files) + } + gotOutput := " " + fileNums(c.outputLevel.files) + gotGrandparents := " " + fileNums(c.grandparents) + got = gotStart + gotML + gotOutput + gotGrandparents + if tc.wantMulti != observedMulti { + t.Fatalf("Expected Multi %t; Observed Multi %t, for %s", tc.wantMulti, observedMulti, got) + } + + } + if got != tc.want { + t.Fatalf("%s:\ngot %q\nwant %q", tc.desc, got, tc.want) + } + } +} + +func TestElideTombstone(t *testing.T) { + var d *DB + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + var buf bytes.Buffer + datadriven.RunTest(t, "testdata/compaction_elide_tombstone", + func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + if d != nil { + if err := d.Close(); err != nil { + return err.Error() + } + } + var err error + if d, err = runDBDefineCmd(td, (&Options{ + FS: vfs.NewMem(), + DebugCheck: DebugCheckLevels, + FormatMajorVersion: FormatNewest, + DisableAutomaticCompactions: true, + }).WithFSDefaults()); err != nil { + return err.Error() + } + if td.HasArg("verbose") { + return d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + } + return d.mu.versions.currentVersion().String() + case 
"elide": + buf.Reset() + var startLevel int + td.ScanArgs(t, "start-level", &startLevel) + c := compaction{ + cmp: testkeys.Comparer.Compare, + comparer: testkeys.Comparer, + version: d.mu.versions.currentVersion(), + inputs: []compactionLevel{{level: startLevel}, {level: startLevel + 1}}, + smallest: base.ParseInternalKey("a.SET.0"), + largest: base.ParseInternalKey("z.SET.0"), + } + c.startLevel, c.outputLevel = &c.inputs[0], &c.inputs[1] + c.setupInuseKeyRanges() + for _, ukey := range strings.Split(td.Input, "\n") { + fmt.Fprintf(&buf, "elideTombstone(%q) = %t\n", ukey, c.elideTombstone([]byte(ukey))) + } + return buf.String() + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestElideRangeTombstone(t *testing.T) { + opts := (*Options)(nil).EnsureDefaults() + + newFileMeta := func(smallest, largest base.InternalKey) *fileMetadata { + m := (&fileMetadata{}).ExtendPointKeyBounds( + opts.Comparer.Compare, smallest, largest, + ) + m.InitPhysicalBacking() + return m + } + + type want struct { + key string + endKey string + expected bool + } + + testCases := []struct { + desc string + level int + version *version + wants []want + flushing flushableList + }{ + { + desc: "empty", + level: 1, + version: newVersion(opts, [numLevels][]*fileMetadata{}), + wants: []want{ + {"x", "y", true}, + }, + }, + { + desc: "non-empty", + level: 1, + version: newVersion(opts, [numLevels][]*fileMetadata{ + 1: { + newFileMeta( + base.ParseInternalKey("c.SET.801"), + base.ParseInternalKey("g.SET.800"), + ), + newFileMeta( + base.ParseInternalKey("x.SET.701"), + base.ParseInternalKey("y.SET.700"), + ), + }, + 2: { + newFileMeta( + base.ParseInternalKey("d.SET.601"), + base.ParseInternalKey("h.SET.600"), + ), + newFileMeta( + base.ParseInternalKey("r.SET.501"), + base.ParseInternalKey("t.SET.500"), + ), + }, + 3: { + newFileMeta( + base.ParseInternalKey("f.SET.401"), + base.ParseInternalKey("g.SET.400"), + ), + newFileMeta( + 
base.ParseInternalKey("w.SET.301"), + base.ParseInternalKey("x.SET.300"), + ), + }, + 4: { + newFileMeta( + base.ParseInternalKey("f.SET.201"), + base.ParseInternalKey("m.SET.200"), + ), + newFileMeta( + base.ParseInternalKey("t.SET.101"), + base.ParseInternalKey("t.SET.100"), + ), + }, + }), + wants: []want{ + {"b", "c", true}, + {"c", "d", true}, + {"d", "e", true}, + {"e", "f", false}, + {"f", "g", false}, + {"g", "h", false}, + {"h", "i", false}, + {"l", "m", false}, + {"m", "n", false}, + {"n", "o", true}, + {"q", "r", true}, + {"r", "s", true}, + {"s", "t", false}, + {"t", "u", false}, + {"u", "v", true}, + {"v", "w", false}, + {"w", "x", false}, + {"x", "y", false}, + {"y", "z", true}, + }, + }, + { + desc: "flushing", + level: -1, + version: newVersion(opts, [numLevels][]*fileMetadata{ + 0: { + newFileMeta( + base.ParseInternalKey("h.SET.901"), + base.ParseInternalKey("j.SET.900"), + ), + }, + 1: { + newFileMeta( + base.ParseInternalKey("c.SET.801"), + base.ParseInternalKey("g.SET.800"), + ), + newFileMeta( + base.ParseInternalKey("x.SET.701"), + base.ParseInternalKey("y.SET.700"), + ), + }, + }), + wants: []want{ + {"m", "n", false}, + }, + // Pretend one memtable is being flushed + flushing: flushableList{nil}, + }, + } + + for _, tc := range testCases { + c := compaction{ + cmp: DefaultComparer.Compare, + comparer: DefaultComparer, + version: tc.version, + inputs: []compactionLevel{{level: tc.level}, {level: tc.level + 1}}, + smallest: base.ParseInternalKey("a.SET.0"), + largest: base.ParseInternalKey("z.SET.0"), + flushing: tc.flushing, + } + c.startLevel, c.outputLevel = &c.inputs[0], &c.inputs[1] + c.setupInuseKeyRanges() + for _, w := range tc.wants { + if got := c.elideRangeTombstone([]byte(w.key), []byte(w.endKey)); got != w.expected { + t.Errorf("%s: keys=%q-%q: got %v, want %v", tc.desc, w.key, w.endKey, got, w.expected) + } + } + } +} + +func TestCompactionTransform(t *testing.T) { + datadriven.RunTest(t, "testdata/compaction_transform", func(t 
*testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "transform": + var snapshots []uint64 + var keyRanges []manifest.UserKeyRange + disableElision := td.HasArg("disable-elision") + td.MaybeScanArgs(t, "snapshots", &snapshots) + if arg, ok := td.Arg("in-use-key-ranges"); ok { + for _, keyRange := range arg.Vals { + parts := strings.SplitN(keyRange, "-", 2) + start := []byte(strings.TrimSpace(parts[0])) + end := []byte(strings.TrimSpace(parts[1])) + keyRanges = append(keyRanges, manifest.UserKeyRange{ + Start: start, + End: end, + }) + } + } + span := keyspan.ParseSpan(td.Input) + for i := range span.Keys { + if i > 0 { + if span.Keys[i-1].Trailer < span.Keys[i].Trailer { + return "span keys not sorted" + } + } + } + var outSpan keyspan.Span + c := compaction{ + cmp: base.DefaultComparer.Compare, + comparer: base.DefaultComparer, + disableSpanElision: disableElision, + inuseKeyRanges: keyRanges, + } + transformer := rangeKeyCompactionTransform(base.DefaultComparer.Equal, snapshots, c.elideRangeTombstone) + if err := transformer.Transform(base.DefaultComparer.Compare, span, &outSpan); err != nil { + return fmt.Sprintf("error: %s", err) + } + return outSpan.String() + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +type cpuPermissionGranter struct { + // requestCount is used to confirm that every GetPermission function call + // has a corresponding CPUWorkDone function call. + requestCount int + used bool + permit bool +} + +type cpuWorkHandle struct { + permit bool +} + +func (c cpuWorkHandle) Permitted() bool { + return c.permit +} + +func (t *cpuPermissionGranter) GetPermission(dur time.Duration) CPUWorkHandle { + t.requestCount++ + t.used = true + return cpuWorkHandle{t.permit} +} + +func (t *cpuPermissionGranter) CPUWorkDone(_ CPUWorkHandle) { + t.requestCount-- +} + +// Simple test to check if compactions are using the granter, and if exactly +// the acquired handles are returned. 
+func TestCompactionCPUGranter(t *testing.T) { + mem := vfs.NewMem() + opts := (&Options{FS: mem}).WithFSDefaults() + g := &cpuPermissionGranter{permit: true} + opts.Experimental.CPUWorkPermissionGranter = g + d, err := Open("", opts) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer d.Close() + + d.Set([]byte{'a'}, []byte{'a'}, nil) + err = d.Compact([]byte{'a'}, []byte{'b'}, true) + if err != nil { + t.Fatalf("Compact: %v", err) + } + require.True(t, g.used) + require.Equal(t, g.requestCount, 0) +} + +// Tests that there's no errors or panics when the default CPU granter is used. +func TestCompactionCPUGranterDefault(t *testing.T) { + mem := vfs.NewMem() + opts := (&Options{FS: mem}).WithFSDefaults() + d, err := Open("", opts) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer d.Close() + + d.Set([]byte{'a'}, []byte{'a'}, nil) + err = d.Compact([]byte{'a'}, []byte{'b'}, true) + if err != nil { + t.Fatalf("Compact: %v", err) + } +} + +func TestCompaction(t *testing.T) { + const memTableSize = 10000 + // Tuned so that 2 values can reside in the memtable before a flush, but a + // 3rd value will cause a flush. Needs to account for the max skiplist node + // size. 
+ const valueSize = 3500 + + mem := vfs.NewMem() + opts := &Options{ + FS: mem, + MemTableSize: memTableSize, + DebugCheck: DebugCheckLevels, + L0CompactionThreshold: 8, + } + opts.testingRandomized(t).WithFSDefaults() + d, err := Open("", opts) + if err != nil { + t.Fatalf("Open: %v", err) + } + + get1 := func(iter internalIterator) (ret string) { + b := &bytes.Buffer{} + for key, _ := iter.First(); key != nil; key, _ = iter.Next() { + b.Write(key.UserKey) + } + if err := iter.Close(); err != nil { + t.Fatalf("iterator Close: %v", err) + } + return b.String() + } + getAll := func() (gotMem, gotDisk string, err error) { + d.mu.Lock() + defer d.mu.Unlock() + + if d.mu.mem.mutable != nil { + gotMem = get1(d.mu.mem.mutable.newIter(nil)) + } + ss := []string(nil) + v := d.mu.versions.currentVersion() + provider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(mem, "" /* dirName */)) + if err != nil { + t.Fatalf("%v", err) + } + defer provider.Close() + for _, levelMetadata := range v.Levels { + iter := levelMetadata.Iter() + for meta := iter.First(); meta != nil; meta = iter.Next() { + if meta.Virtual { + continue + } + f, err := provider.OpenForReading(context.Background(), base.FileTypeTable, meta.FileBacking.DiskFileNum, objstorage.OpenOptions{}) + if err != nil { + return "", "", errors.WithStack(err) + } + r, err := sstable.NewReader(f, sstable.ReaderOptions{}) + if err != nil { + return "", "", errors.WithStack(err) + } + defer r.Close() + iter, err := r.NewIter(nil /* lower */, nil /* upper */) + if err != nil { + return "", "", errors.WithStack(err) + } + ss = append(ss, get1(iter)+".") + } + } + sort.Strings(ss) + return gotMem, strings.Join(ss, ""), nil + } + + value := bytes.Repeat([]byte("x"), valueSize) + testCases := []struct { + key, wantMem, wantDisk string + }{ + {"+A", "A", ""}, + {"+a", "Aa", ""}, + {"+B", "B", "Aa."}, + {"+b", "Bb", "Aa."}, + // The next level-0 table overwrites the B key. 
+ {"+C", "C", "Aa.Bb."}, + {"+B", "BC", "Aa.Bb."}, + // The next level-0 table deletes the a key. + {"+D", "D", "Aa.BC.Bb."}, + {"-a", "Da", "Aa.BC.Bb."}, + {"+d", "Dad", "Aa.BC.Bb."}, + {"+E", "E", "Aa.BC.Bb.Dad."}, + {"+e", "Ee", "Aa.BC.Bb.Dad."}, + // The next addition creates the fourth level-0 table, and l0CompactionTrigger == 8, + // but since the sublevel count is doubled when comparing with l0CompactionTrigger, + // the addition of the 4th sublevel triggers a non-trivial compaction into one level-1 table. + // Note that the keys in this one larger table are interleaved from the four smaller ones. + {"+F", "F", "ABCDEbde."}, + } + for _, tc := range testCases { + if key := tc.key[1:]; tc.key[0] == '+' { + if err := d.Set([]byte(key), value, nil); err != nil { + t.Errorf("%q: Set: %v", key, err) + break + } + } else { + if err := d.Delete([]byte(key), nil); err != nil { + t.Errorf("%q: Delete: %v", key, err) + break + } + } + + // try backs off to allow any writes to the memfs to complete. 
+ err := try(100*time.Microsecond, 20*time.Second, func() error { + gotMem, gotDisk, err := getAll() + if err != nil { + return err + } + if testing.Verbose() { + fmt.Printf("mem=%s (%s) disk=%s (%s)\n", gotMem, tc.wantMem, gotDisk, tc.wantDisk) + } + + if gotMem != tc.wantMem { + return errors.Errorf("mem: got %q, want %q", gotMem, tc.wantMem) + } + if gotDisk != tc.wantDisk { + return errors.Errorf("ldb: got %q, want %q", gotDisk, tc.wantDisk) + } + return nil + }) + if err != nil { + t.Errorf("%q: %v", tc.key, err) + } + } + if err := d.Close(); err != nil { + t.Fatalf("db Close: %v", err) + } +} + +func TestValidateVersionEdit(t *testing.T) { + const badKey = "malformed-key" + + errValidationFailed := errors.New("validation failed") + validateFn := func(key []byte) error { + if string(key) == badKey { + return errValidationFailed + } + return nil + } + + cmp := DefaultComparer.Compare + newFileMeta := func(smallest, largest base.InternalKey) *fileMetadata { + m := (&fileMetadata{}).ExtendPointKeyBounds(cmp, smallest, largest) + m.InitPhysicalBacking() + return m + } + + testCases := []struct { + desc string + ve *versionEdit + vFunc func([]byte) error + wantErr error + }{ + { + desc: "single new file; start key", + ve: &versionEdit{ + NewFiles: []manifest.NewFileEntry{ + { + Meta: newFileMeta( + manifest.InternalKey{UserKey: []byte(badKey)}, + manifest.InternalKey{UserKey: []byte("z")}, + ), + }, + }, + }, + vFunc: validateFn, + wantErr: errValidationFailed, + }, + { + desc: "single new file; end key", + ve: &versionEdit{ + NewFiles: []manifest.NewFileEntry{ + { + Meta: newFileMeta( + manifest.InternalKey{UserKey: []byte("a")}, + manifest.InternalKey{UserKey: []byte(badKey)}, + ), + }, + }, + }, + vFunc: validateFn, + wantErr: errValidationFailed, + }, + { + desc: "multiple new files", + ve: &versionEdit{ + NewFiles: []manifest.NewFileEntry{ + { + Meta: newFileMeta( + manifest.InternalKey{UserKey: []byte("a")}, + manifest.InternalKey{UserKey: []byte("c")}, + ), 
+ }, + { + Meta: newFileMeta( + manifest.InternalKey{UserKey: []byte(badKey)}, + manifest.InternalKey{UserKey: []byte("z")}, + ), + }, + }, + }, + vFunc: validateFn, + wantErr: errValidationFailed, + }, + { + desc: "single deleted file; start key", + ve: &versionEdit{ + DeletedFiles: map[manifest.DeletedFileEntry]*manifest.FileMetadata{ + deletedFileEntry{Level: 0, FileNum: 0}: newFileMeta( + manifest.InternalKey{UserKey: []byte(badKey)}, + manifest.InternalKey{UserKey: []byte("z")}, + ), + }, + }, + vFunc: validateFn, + wantErr: errValidationFailed, + }, + { + desc: "single deleted file; end key", + ve: &versionEdit{ + DeletedFiles: map[manifest.DeletedFileEntry]*manifest.FileMetadata{ + deletedFileEntry{Level: 0, FileNum: 0}: newFileMeta( + manifest.InternalKey{UserKey: []byte("a")}, + manifest.InternalKey{UserKey: []byte(badKey)}, + ), + }, + }, + vFunc: validateFn, + wantErr: errValidationFailed, + }, + { + desc: "multiple deleted files", + ve: &versionEdit{ + DeletedFiles: map[manifest.DeletedFileEntry]*manifest.FileMetadata{ + deletedFileEntry{Level: 0, FileNum: 0}: newFileMeta( + manifest.InternalKey{UserKey: []byte("a")}, + manifest.InternalKey{UserKey: []byte("c")}, + ), + deletedFileEntry{Level: 0, FileNum: 1}: newFileMeta( + manifest.InternalKey{UserKey: []byte(badKey)}, + manifest.InternalKey{UserKey: []byte("z")}, + ), + }, + }, + vFunc: validateFn, + wantErr: errValidationFailed, + }, + { + desc: "no errors", + ve: &versionEdit{ + NewFiles: []manifest.NewFileEntry{ + { + Level: 0, + Meta: newFileMeta( + manifest.InternalKey{UserKey: []byte("b")}, + manifest.InternalKey{UserKey: []byte("c")}, + ), + }, + { + Level: 0, + Meta: newFileMeta( + manifest.InternalKey{UserKey: []byte("d")}, + manifest.InternalKey{UserKey: []byte("g")}, + ), + }, + }, + DeletedFiles: map[manifest.DeletedFileEntry]*manifest.FileMetadata{ + deletedFileEntry{Level: 6, FileNum: 0}: newFileMeta( + manifest.InternalKey{UserKey: []byte("a")}, + manifest.InternalKey{UserKey: 
[]byte("d")}, + ), + deletedFileEntry{Level: 6, FileNum: 1}: newFileMeta( + manifest.InternalKey{UserKey: []byte("x")}, + manifest.InternalKey{UserKey: []byte("z")}, + ), + }, + }, + vFunc: validateFn, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + err := validateVersionEdit(tc.ve, tc.vFunc, base.DefaultFormatter) + if tc.wantErr != nil { + if !errors.Is(err, tc.wantErr) { + t.Fatalf("got: %s; want: %s", err, tc.wantErr) + } + return + } + if err != nil { + t.Fatalf("got %s; wanted no error", err) + } + }) + } +} + +func TestManualCompaction(t *testing.T) { + var mem vfs.FS + var d *DB + defer func() { + if d != nil { + require.NoError(t, closeAllSnapshots(d)) + require.NoError(t, d.Close()) + } + }() + + seed := time.Now().UnixNano() + rng := rand.New(rand.NewSource(seed)) + t.Logf("seed: %d", seed) + + randVersion := func(min, max FormatMajorVersion) FormatMajorVersion { + return FormatMajorVersion(int(min) + rng.Intn(int(max)-int(min)+1)) + } + + var compactionLog bytes.Buffer + compactionLogEventListener := &EventListener{ + CompactionEnd: func(info CompactionInfo) { + // Ensure determinism. + info.JobID = 1 + info.Duration = time.Second + info.TotalDuration = time.Second + fmt.Fprintln(&compactionLog, info.String()) + }, + } + reset := func(minVersion, maxVersion FormatMajorVersion) { + compactionLog.Reset() + if d != nil { + require.NoError(t, closeAllSnapshots(d)) + require.NoError(t, d.Close()) + } + mem = vfs.NewMem() + require.NoError(t, mem.MkdirAll("ext", 0755)) + + opts := (&Options{ + FS: mem, + DebugCheck: DebugCheckLevels, + DisableAutomaticCompactions: true, + EventListener: compactionLogEventListener, + FormatMajorVersion: randVersion(minVersion, maxVersion), + }).WithFSDefaults() + + var err error + d, err = Open("", opts) + require.NoError(t, err) + } + + // d.mu must be held when calling. 
+ createOngoingCompaction := func(start, end []byte, startLevel, outputLevel int) (ongoingCompaction *compaction) { + ongoingCompaction = &compaction{ + inputs: []compactionLevel{{level: startLevel}, {level: outputLevel}}, + smallest: InternalKey{UserKey: start}, + largest: InternalKey{UserKey: end}, + } + ongoingCompaction.startLevel = &ongoingCompaction.inputs[0] + ongoingCompaction.outputLevel = &ongoingCompaction.inputs[1] + // Mark files as compacting. + curr := d.mu.versions.currentVersion() + ongoingCompaction.startLevel.files = curr.Overlaps(startLevel, d.cmp, start, end, false) + ongoingCompaction.outputLevel.files = curr.Overlaps(outputLevel, d.cmp, start, end, false) + for _, cl := range ongoingCompaction.inputs { + iter := cl.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + f.CompactionState = manifest.CompactionStateCompacting + } + } + d.mu.compact.inProgress[ongoingCompaction] = struct{}{} + d.mu.compact.compactingCount++ + return + } + + // d.mu must be held when calling. 
+ deleteOngoingCompaction := func(ongoingCompaction *compaction) { + for _, cl := range ongoingCompaction.inputs { + iter := cl.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + f.CompactionState = manifest.CompactionStateNotCompacting + } + } + delete(d.mu.compact.inProgress, ongoingCompaction) + d.mu.compact.compactingCount-- + } + + runTest := func(t *testing.T, testData string, minVersion, maxVersion FormatMajorVersion, verbose bool) { + reset(minVersion, maxVersion) + var ongoingCompaction *compaction + datadriven.RunTest(t, testData, func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + reset(minVersion, maxVersion) + return "" + + case "batch": + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + require.NoError(t, b.Commit(nil)) + return "" + + case "build": + if err := runBuildCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + + case "compact": + if err := runCompactCmd(td, d); err != nil { + return err.Error() + } + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + if verbose { + s = d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + } + d.mu.Unlock() + if td.HasArg("hide-file-num") { + re := regexp.MustCompile(`([0-9]*):\[`) + s = re.ReplaceAllString(s, "[") + } + return s + + case "define": + if d != nil { + if err := closeAllSnapshots(d); err != nil { + return err.Error() + } + if err := d.Close(); err != nil { + return err.Error() + } + } + + mem = vfs.NewMem() + opts := (&Options{ + FS: mem, + DebugCheck: DebugCheckLevels, + EventListener: compactionLogEventListener, + FormatMajorVersion: randVersion(minVersion, maxVersion), + DisableAutomaticCompactions: true, + }).WithFSDefaults() + + var err error + if d, err = runDBDefineCmd(td, opts); err != nil { + return err.Error() + } + + s := d.mu.versions.currentVersion().String() + if verbose { + s = 
d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + } + return s + + case "file-sizes": + return runTableFileSizesCmd(td, d) + + case "flush": + if err := d.Flush(); err != nil { + return err.Error() + } + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + if verbose { + s = d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + } + d.mu.Unlock() + return s + + case "ingest": + if err := runIngestCmd(td, d, mem); err != nil { + return err.Error() + } + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + if verbose { + s = d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + } + d.mu.Unlock() + return s + + case "iter": + // TODO(peter): runDBDefineCmd doesn't properly update the visible + // sequence number. So we have to use a snapshot with a very large + // sequence number, otherwise the DB appears empty. + snap := Snapshot{ + db: d, + seqNum: InternalKeySeqNumMax, + } + iter, _ := snap.NewIter(nil) + return runIterCmd(td, iter, true) + + case "lsm": + return runLSMCmd(td, d) + + case "populate": + b := d.NewBatch() + runPopulateCmd(t, td, b) + count := b.Count() + require.NoError(t, b.Commit(nil)) + return fmt.Sprintf("wrote %d keys\n", count) + + case "async-compact": + var s string + ch := make(chan error, 1) + go func() { + if err := runCompactCmd(td, d); err != nil { + ch <- err + close(ch) + return + } + d.mu.Lock() + s = d.mu.versions.currentVersion().String() + d.mu.Unlock() + close(ch) + }() + + manualDone := func() bool { + select { + case <-ch: + return true + default: + return false + } + } + + err := try(100*time.Microsecond, 20*time.Second, func() error { + if manualDone() { + return nil + } + + d.mu.Lock() + defer d.mu.Unlock() + if len(d.mu.compact.manual) == 0 { + return errors.New("no manual compaction queued") + } + manual := d.mu.compact.manual[0] + if manual.retries == 0 { + return errors.New("manual compaction has not been retried") + } + return nil + }) + if err != nil { + 
return err.Error() + } + + if manualDone() { + return "manual compaction did not block for ongoing\n" + s + } + + d.mu.Lock() + deleteOngoingCompaction(ongoingCompaction) + ongoingCompaction = nil + d.maybeScheduleCompaction() + d.mu.Unlock() + if err := <-ch; err != nil { + return err.Error() + } + return "manual compaction blocked until ongoing finished\n" + s + + case "add-ongoing-compaction": + var startLevel int + var outputLevel int + var start string + var end string + td.ScanArgs(t, "startLevel", &startLevel) + td.ScanArgs(t, "outputLevel", &outputLevel) + td.ScanArgs(t, "start", &start) + td.ScanArgs(t, "end", &end) + d.mu.Lock() + ongoingCompaction = createOngoingCompaction([]byte(start), []byte(end), startLevel, outputLevel) + d.mu.Unlock() + return "" + + case "remove-ongoing-compaction": + d.mu.Lock() + deleteOngoingCompaction(ongoingCompaction) + ongoingCompaction = nil + d.mu.Unlock() + return "" + + case "set-concurrent-compactions": + var concurrentCompactions int + td.ScanArgs(t, "num", &concurrentCompactions) + d.opts.MaxConcurrentCompactions = func() int { + return concurrentCompactions + } + return "" + + case "sstable-properties": + return runSSTablePropertiesCmd(t, td, d) + + case "wait-pending-table-stats": + return runTableStatsCmd(td, d) + + case "close-snapshots": + d.mu.Lock() + // Re-enable automatic compactions if they were disabled so that + // closing snapshots can trigger elision-only compactions if + // necessary. 
+ d.opts.DisableAutomaticCompactions = false + + var ss []*Snapshot + l := &d.mu.snapshots + for i := l.root.next; i != &l.root; i = i.next { + ss = append(ss, i) + } + d.mu.Unlock() + for i := range ss { + if err := ss[i].Close(); err != nil { + return err.Error() + } + } + return "" + + case "compaction-log": + defer compactionLog.Reset() + return compactionLog.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) + } + + testCases := []struct { + testData string + minVersion FormatMajorVersion + maxVersion FormatMajorVersion // inclusive + verbose bool + }{ + { + testData: "testdata/manual_compaction", + minVersion: FormatMostCompatible, + maxVersion: FormatSetWithDelete - 1, + }, + { + testData: "testdata/manual_compaction_set_with_del", + minVersion: FormatBlockPropertyCollector, + // This test exercises split user keys. + maxVersion: FormatSplitUserKeysMarkedCompacted - 1, + }, + { + testData: "testdata/singledel_manual_compaction", + minVersion: FormatMostCompatible, + maxVersion: FormatSetWithDelete - 1, + }, + { + testData: "testdata/singledel_manual_compaction_set_with_del", + minVersion: FormatSetWithDelete, + maxVersion: internalFormatNewest, + }, + { + testData: "testdata/manual_compaction_range_keys", + minVersion: FormatRangeKeys, + maxVersion: internalFormatNewest, + verbose: true, + }, + { + testData: "testdata/manual_compaction_file_boundaries", + minVersion: FormatBlockPropertyCollector, + // This test exercises split user keys. 
+ maxVersion: FormatSplitUserKeysMarkedCompacted - 1, + }, + { + testData: "testdata/manual_compaction_file_boundaries_delsized", + minVersion: FormatDeleteSizedAndObsolete, + maxVersion: internalFormatNewest, + }, + { + testData: "testdata/manual_compaction_set_with_del_sstable_Pebblev4", + minVersion: FormatDeleteSizedAndObsolete, + maxVersion: internalFormatNewest, + }, + { + testData: "testdata/manual_compaction_multilevel", + minVersion: FormatMostCompatible, + maxVersion: internalFormatNewest, + }, + } + + for _, tc := range testCases { + t.Run(tc.testData, func(t *testing.T) { + runTest(t, tc.testData, tc.minVersion, tc.maxVersion, tc.verbose) + }) + } +} + +func TestCompactionFindGrandparentLimit(t *testing.T) { + cmp := DefaultComparer.Compare + var grandparents []*fileMetadata + + var fileNum base.FileNum + parseMeta := func(s string) *fileMetadata { + parts := strings.Split(s, "-") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + fileNum++ + m := (&fileMetadata{ + FileNum: fileNum, + }).ExtendPointKeyBounds( + cmp, + InternalKey{UserKey: []byte(parts[0])}, + InternalKey{UserKey: []byte(parts[1])}, + ) + m.InitPhysicalBacking() + return m + } + + datadriven.RunTest(t, "testdata/compaction_find_grandparent_limit", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + grandparents = nil + if len(d.Input) == 0 { + return "" + } + for _, data := range strings.Split(d.Input, "\n") { + parts := strings.Fields(data) + if len(parts) != 2 { + return fmt.Sprintf("malformed test:\n%s", d.Input) + } + + meta := parseMeta(parts[0]) + var err error + meta.Size, err = strconv.ParseUint(parts[1], 10, 64) + if err != nil { + return err.Error() + } + grandparents = append(grandparents, meta) + } + return "" + + case "compact": + c := &compaction{ + cmp: cmp, + equal: DefaultComparer.Equal, + comparer: DefaultComparer, + grandparents: manifest.NewLevelSliceKeySorted(cmp, grandparents), + } + if len(d.CmdArgs) != 1 { 
+ return fmt.Sprintf("%s expects 1 argument", d.Cmd) + } + if len(d.CmdArgs[0].Vals) != 1 { + return fmt.Sprintf("%s expects 1 value", d.CmdArgs[0].Key) + } + var err error + c.maxOverlapBytes, err = strconv.ParseUint(d.CmdArgs[0].Vals[0], 10, 64) + if err != nil { + return err.Error() + } + + var buf bytes.Buffer + var smallest, largest string + var grandparentLimit []byte + for i, key := range strings.Fields(d.Input) { + if i == 0 { + smallest = key + grandparentLimit = c.findGrandparentLimit([]byte(key)) + } + if grandparentLimit != nil && c.cmp(grandparentLimit, []byte(key)) < 0 { + fmt.Fprintf(&buf, "%s-%s\n", smallest, largest) + smallest = key + grandparentLimit = c.findGrandparentLimit([]byte(key)) + } + largest = key + } + fmt.Fprintf(&buf, "%s-%s\n", smallest, largest) + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestCompactionFindL0Limit(t *testing.T) { + cmp := DefaultComparer.Compare + + fileNumCounter := 1 + parseMeta := func(s string) (*fileMetadata, error) { + fields := strings.Fields(s) + parts := strings.Split(fields[0], "-") + if len(parts) != 2 { + return nil, errors.Errorf("malformed table spec: %s", s) + } + m := (&fileMetadata{ + FileNum: base.FileNum(fileNumCounter), + }).ExtendPointKeyBounds( + cmp, + base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + fileNumCounter++ + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + + for _, field := range fields[1:] { + parts := strings.Split(field, "=") + switch parts[0] { + case "size": + size, err := strconv.ParseUint(parts[1], 10, 64) + if err != nil { + t.Fatal(err) + } + m.Size = size + } + } + m.InitPhysicalBacking() + return m, nil + } + + var vers *version + flushSplitBytes := int64(0) + + datadriven.RunTest(t, "testdata/compaction_find_l0_limit", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + fileMetas 
:= [manifest.NumLevels][]*fileMetadata{} + baseLevel := manifest.NumLevels - 1 + level := 0 + d.MaybeScanArgs(t, "flush_split_bytes", &flushSplitBytes) + + var err error + for _, data := range strings.Split(d.Input, "\n") { + data = strings.TrimSpace(data) + switch data { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + level, err = strconv.Atoi(data[1:]) + if err != nil { + return err.Error() + } + default: + meta, err := parseMeta(data) + if err != nil { + return err.Error() + } + if level != 0 && level < baseLevel { + baseLevel = level + } + fileMetas[level] = append(fileMetas[level], meta) + } + } + + vers = manifest.NewVersion(DefaultComparer.Compare, base.DefaultFormatter, flushSplitBytes, fileMetas) + flushSplitKeys := vers.L0Sublevels.FlushSplitKeys() + + var buf strings.Builder + buf.WriteString(vers.String()) + buf.WriteString("flush split keys:\n") + for _, key := range flushSplitKeys { + fmt.Fprintf(&buf, "\t%s\n", base.DefaultFormatter(key)) + } + + return buf.String() + + case "flush": + c := &compaction{ + cmp: cmp, + equal: DefaultComparer.Equal, + comparer: DefaultComparer, + version: vers, + l0Limits: vers.L0Sublevels.FlushSplitKeys(), + inputs: []compactionLevel{{level: -1}, {level: 0}}, + } + c.startLevel, c.outputLevel = &c.inputs[0], &c.inputs[1] + + var buf bytes.Buffer + var smallest, largest string + var l0Limit []byte + for i, key := range strings.Fields(d.Input) { + if i == 0 { + smallest = key + l0Limit = c.findL0Limit([]byte(key)) + } + if l0Limit != nil && c.cmp(l0Limit, []byte(key)) < 0 { + fmt.Fprintf(&buf, "%s-%s\n", smallest, largest) + smallest = key + l0Limit = c.findL0Limit([]byte(key)) + } + largest = key + } + fmt.Fprintf(&buf, "%s-%s\n", smallest, largest) + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestCompactionOutputLevel(t *testing.T) { + opts := (*Options)(nil).EnsureDefaults() + version := &version{} + + datadriven.RunTest(t, 
"testdata/compaction_output_level", + func(t *testing.T, d *datadriven.TestData) (res string) { + defer func() { + if r := recover(); r != nil { + res = fmt.Sprintln(r) + } + }() + + switch d.Cmd { + case "compact": + var start, base int + d.ScanArgs(t, "start", &start) + d.ScanArgs(t, "base", &base) + pc := newPickedCompaction(opts, version, start, defaultOutputLevel(start, base), base) + c := newCompaction(pc, opts, time.Now(), nil /* provider */) + return fmt.Sprintf("output=%d\nmax-output-file-size=%d\n", + c.outputLevel.level, c.maxOutputFileSize) + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestCompactionAtomicUnitBounds(t *testing.T) { + cmp := DefaultComparer.Compare + var files manifest.LevelSlice + + parseMeta := func(s string) *fileMetadata { + parts := strings.Split(s, "-") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + m := (&fileMetadata{}).ExtendPointKeyBounds( + cmp, + base.ParseInternalKey(parts[0]), + base.ParseInternalKey(parts[1]), + ) + m.InitPhysicalBacking() + return m + } + + datadriven.RunTest(t, "testdata/compaction_atomic_unit_bounds", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + files = manifest.LevelSlice{} + if len(d.Input) == 0 { + return "" + } + var ff []*fileMetadata + for _, data := range strings.Split(d.Input, "\n") { + meta := parseMeta(data) + meta.FileNum = FileNum(len(ff)) + ff = append(ff, meta) + } + files = manifest.NewLevelSliceKeySorted(cmp, ff) + return "" + + case "atomic-unit-bounds": + c := &compaction{ + cmp: cmp, + equal: DefaultComparer.Equal, + comparer: DefaultComparer, + inputs: []compactionLevel{{files: files}, {}}, + } + c.startLevel, c.outputLevel = &c.inputs[0], &c.inputs[1] + if len(d.CmdArgs) != 1 { + return fmt.Sprintf("%s expects 1 argument", d.Cmd) + } + index, err := strconv.ParseInt(d.CmdArgs[0].String(), 10, 64) + if err != nil { + return err.Error() + } + iter := files.Iter() + // Advance 
iter to `index`. + _ = iter.First() + for i := int64(0); i < index; i++ { + _ = iter.Next() + } + atomicUnit, _ := expandToAtomicUnit(c.cmp, iter.Take().Slice(), true /* disableIsCompacting */) + lower, upper := manifest.KeyRange(c.cmp, atomicUnit.Iter()) + return fmt.Sprintf("%s-%s\n", lower.UserKey, upper.UserKey) + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestCompactionDeleteOnlyHints(t *testing.T) { + parseUint64 := func(s string) uint64 { + v, err := strconv.ParseUint(s, 10, 64) + require.NoError(t, err) + return v + } + var d *DB + defer func() { + if d != nil { + require.NoError(t, closeAllSnapshots(d)) + require.NoError(t, d.Close()) + } + }() + + var compactInfo *CompactionInfo // protected by d.mu + reset := func() (*Options, error) { + if d != nil { + compactInfo = nil + if err := closeAllSnapshots(d); err != nil { + return nil, err + } + if err := d.Close(); err != nil { + return nil, err + } + } + opts := (&Options{ + FS: vfs.NewMem(), + DebugCheck: DebugCheckLevels, + EventListener: &EventListener{ + CompactionEnd: func(info CompactionInfo) { + if compactInfo != nil { + return + } + compactInfo = &info + }, + }, + FormatMajorVersion: internalFormatNewest, + }).WithFSDefaults() + + // Collection of table stats can trigger compactions. As we want full + // control over when compactions are run, disable stats by default. + opts.private.disableTableStats = true + + return opts, nil + } + + compactionString := func() string { + for d.mu.compact.compactingCount > 0 { + d.mu.compact.cond.Wait() + } + + s := "(none)" + if compactInfo != nil { + // Fix the job ID and durations for determinism. 
+ compactInfo.JobID = 100 + compactInfo.Duration = time.Second + compactInfo.TotalDuration = 2 * time.Second + s = compactInfo.String() + compactInfo = nil + } + return s + } + + var err error + var opts *Options + datadriven.RunTest(t, "testdata/compaction_delete_only_hints", + func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + opts, err = reset() + if err != nil { + return err.Error() + } + d, err = runDBDefineCmd(td, opts) + if err != nil { + return err.Error() + } + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "force-set-hints": + d.mu.Lock() + defer d.mu.Unlock() + d.mu.compact.deletionHints = d.mu.compact.deletionHints[:0] + var buf bytes.Buffer + for _, data := range strings.Split(td.Input, "\n") { + parts := strings.FieldsFunc(strings.TrimSpace(data), + func(r rune) bool { return r == '-' || r == ' ' || r == '.' }) + + start, end := []byte(parts[2]), []byte(parts[3]) + + var tombstoneFile *fileMetadata + tombstoneLevel := int(parseUint64(parts[0][1:])) + + // Set file number to the value provided in the input. 
+ tombstoneFile = &fileMetadata{ + FileNum: base.FileNum(parseUint64(parts[1])), + } + + var hintType deleteCompactionHintType + switch typ := parts[7]; typ { + case "point_key_only": + hintType = deleteCompactionHintTypePointKeyOnly + case "range_key_only": + hintType = deleteCompactionHintTypeRangeKeyOnly + case "point_and_range_key": + hintType = deleteCompactionHintTypePointAndRangeKey + default: + return fmt.Sprintf("unknown hint type: %s", typ) + } + + h := deleteCompactionHint{ + hintType: hintType, + start: start, + end: end, + fileSmallestSeqNum: parseUint64(parts[4]), + tombstoneLevel: tombstoneLevel, + tombstoneFile: tombstoneFile, + tombstoneSmallestSeqNum: parseUint64(parts[5]), + tombstoneLargestSeqNum: parseUint64(parts[6]), + } + d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, h) + fmt.Fprintln(&buf, h.String()) + } + return buf.String() + + case "get-hints": + d.mu.Lock() + defer d.mu.Unlock() + + // Force collection of table stats. This requires re-enabling the + // collection flag. We also do not want compactions to run as part of + // the stats collection job, so we disable it temporarily. + d.opts.private.disableTableStats = false + d.opts.DisableAutomaticCompactions = true + defer func() { + d.opts.private.disableTableStats = true + d.opts.DisableAutomaticCompactions = false + }() + + // NB: collectTableStats attempts to acquire the lock. Temporarily + // unlock here to avoid a deadlock. + d.mu.Unlock() + didRun := d.collectTableStats() + d.mu.Lock() + + if !didRun { + // If a job was already running, wait for the results. 
+ d.waitTableStats() + } + + hints := d.mu.compact.deletionHints + if len(hints) == 0 { + return "(none)" + } + var buf bytes.Buffer + for _, h := range hints { + buf.WriteString(h.String() + "\n") + } + return buf.String() + + case "maybe-compact": + d.mu.Lock() + d.maybeScheduleCompaction() + + var buf bytes.Buffer + fmt.Fprintf(&buf, "Deletion hints:\n") + for _, h := range d.mu.compact.deletionHints { + fmt.Fprintf(&buf, " %s\n", h.String()) + } + if len(d.mu.compact.deletionHints) == 0 { + fmt.Fprintf(&buf, " (none)\n") + } + fmt.Fprintf(&buf, "Compactions:\n") + fmt.Fprintf(&buf, " %s", compactionString()) + d.mu.Unlock() + return buf.String() + + case "compact": + if err := runCompactCmd(td, d); err != nil { + return err.Error() + } + d.mu.Lock() + compactInfo = nil + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "close-snapshot": + seqNum, err := strconv.ParseUint(strings.TrimSpace(td.Input), 0, 64) + if err != nil { + return err.Error() + } + d.mu.Lock() + var s *Snapshot + l := &d.mu.snapshots + for i := l.root.next; i != &l.root; i = i.next { + if i.seqNum == seqNum { + s = i + } + } + d.mu.Unlock() + if s == nil { + return "(not found)" + } else if err := s.Close(); err != nil { + return err.Error() + } + + d.mu.Lock() + // Closing the snapshot may have triggered a compaction. 
+ str := compactionString() + d.mu.Unlock() + return str + + case "iter": + snap := Snapshot{ + db: d, + seqNum: InternalKeySeqNumMax, + } + iter, _ := snap.NewIter(nil) + return runIterCmd(td, iter, true) + + case "reset": + opts, err = reset() + if err != nil { + return err.Error() + } + d, err = Open("", opts) + if err != nil { + return err.Error() + } + return "" + + case "ingest": + if err = runBuildCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + if err = runIngestCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "OK" + + case "describe-lsm": + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestCompactionTombstones(t *testing.T) { + var d *DB + defer func() { + if d != nil { + require.NoError(t, closeAllSnapshots(d)) + require.NoError(t, d.Close()) + } + }() + + var compactInfo *CompactionInfo // protected by d.mu + + compactionString := func() string { + for d.mu.compact.compactingCount > 0 { + d.mu.compact.cond.Wait() + } + + s := "(none)" + if compactInfo != nil { + // Fix the job ID and durations for determinism. 
+ compactInfo.JobID = 100 + compactInfo.Duration = time.Second + compactInfo.TotalDuration = 2 * time.Second + s = compactInfo.String() + compactInfo = nil + } + return s + } + + datadriven.RunTest(t, "testdata/compaction_tombstones", + func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + if d != nil { + compactInfo = nil + require.NoError(t, closeAllSnapshots(d)) + if err := d.Close(); err != nil { + return err.Error() + } + } + opts := (&Options{ + FS: vfs.NewMem(), + DebugCheck: DebugCheckLevels, + EventListener: &EventListener{ + CompactionEnd: func(info CompactionInfo) { + compactInfo = &info + }, + }, + FormatMajorVersion: internalFormatNewest, + }).WithFSDefaults() + var err error + d, err = runDBDefineCmd(td, opts) + if err != nil { + return err.Error() + } + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "maybe-compact": + d.mu.Lock() + d.opts.DisableAutomaticCompactions = false + d.maybeScheduleCompaction() + s := compactionString() + d.mu.Unlock() + return s + + case "wait-pending-table-stats": + return runTableStatsCmd(td, d) + + case "close-snapshot": + seqNum, err := strconv.ParseUint(strings.TrimSpace(td.Input), 0, 64) + if err != nil { + return err.Error() + } + d.mu.Lock() + var s *Snapshot + l := &d.mu.snapshots + for i := l.root.next; i != &l.root; i = i.next { + if i.seqNum == seqNum { + s = i + } + } + d.mu.Unlock() + if s == nil { + return "(not found)" + } else if err := s.Close(); err != nil { + return err.Error() + } + + d.mu.Lock() + // Closing the snapshot may have triggered a compaction. 
+ str := compactionString() + d.mu.Unlock() + return str + + case "close": + if err := d.Close(); err != nil { + return err.Error() + } + d = nil + return "" + + case "version": + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func closeAllSnapshots(d *DB) error { + d.mu.Lock() + var ss []*Snapshot + l := &d.mu.snapshots + for i := l.root.next; i != &l.root; i = i.next { + ss = append(ss, i) + } + d.mu.Unlock() + for i := range ss { + if err := ss[i].Close(); err != nil { + return err + } + } + return nil +} + +func TestCompactionReadTriggeredQueue(t *testing.T) { + + // Convert a read compaction to a string which this test + // understands. + showRC := func(rc *readCompaction) string { + return fmt.Sprintf( + "L%d: %s-%s %d\n", rc.level, string(rc.start), string(rc.end), rc.fileNum, + ) + } + + var queue *readCompactionQueue + + datadriven.RunTest(t, "testdata/read_compaction_queue", + func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "create": + queue = &readCompactionQueue{} + return "(success)" + case "add-compaction": + for _, line := range strings.Split(td.Input, "\n") { + if line == "" { + continue + } + parts := strings.Split(line, " ") + + if len(parts) != 3 { + return "error: malformed data for add-compaction. usage: : - " + } + if l, err := strconv.Atoi(parts[0][1:2]); err == nil { + keys := strings.Split(parts[1], "-") + fileNum, _ := strconv.Atoi(parts[2]) + rc := readCompaction{ + level: l, + start: []byte(keys[0]), + end: []byte(keys[1]), + fileNum: base.FileNum(fileNum), + } + queue.add(&rc, DefaultComparer.Compare) + } else { + return err.Error() + } + } + return "" + case "remove-compaction": + rc := queue.remove() + if rc == nil { + return "(nil)" + } + return showRC(rc) + case "print-size": + // Print the size of the queue. 
+ return fmt.Sprintf("%d", queue.size) + case "print-queue": + // Print each element of the queue on a separate line. + var sb strings.Builder + if queue.size == 0 { + sb.WriteString("(empty)") + } + + for i := 0; i < queue.size; i++ { + rc := queue.at(i) + sb.WriteString(showRC(rc)) + } + return sb.String() + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }, + ) +} + +func (qu *readCompactionQueue) at(i int) *readCompaction { + if i >= qu.size { + return nil + } + + return qu.queue[i] +} + +func TestCompactionReadTriggered(t *testing.T) { + var d *DB + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + var compactInfo *CompactionInfo // protected by d.mu + + compactionString := func() string { + for d.mu.compact.compactingCount > 0 { + d.mu.compact.cond.Wait() + } + + s := "(none)" + if compactInfo != nil { + // Fix the job ID and durations for determinism. + compactInfo.JobID = 100 + compactInfo.Duration = time.Second + compactInfo.TotalDuration = 2 * time.Second + s = compactInfo.String() + compactInfo = nil + } + return s + } + + datadriven.RunTest(t, "testdata/compaction_read_triggered", + func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + if d != nil { + compactInfo = nil + if err := d.Close(); err != nil { + return err.Error() + } + } + opts := (&Options{ + FS: vfs.NewMem(), + DebugCheck: DebugCheckLevels, + EventListener: &EventListener{ + CompactionEnd: func(info CompactionInfo) { + compactInfo = &info + }, + }, + }).WithFSDefaults() + var err error + d, err = runDBDefineCmd(td, opts) + if err != nil { + return err.Error() + } + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "add-read-compaction": + d.mu.Lock() + td.MaybeScanArgs(t, "flushing", &d.mu.compact.flushing) + for _, line := range strings.Split(td.Input, "\n") { + if line == "" { + continue + } + parts := strings.Split(line, " ") + if len(parts) != 3 { + return 
"error: malformed data for add-read-compaction. usage: : - " + } + if l, err := strconv.Atoi(parts[0][:1]); err == nil { + keys := strings.Split(parts[1], "-") + fileNum, _ := strconv.Atoi(parts[2]) + rc := readCompaction{ + level: l, + start: []byte(keys[0]), + end: []byte(keys[1]), + fileNum: base.FileNum(fileNum), + } + d.mu.compact.readCompactions.add(&rc, DefaultComparer.Compare) + } else { + return err.Error() + } + } + d.mu.Unlock() + return "" + + case "show-read-compactions": + d.mu.Lock() + var sb strings.Builder + if d.mu.compact.readCompactions.size == 0 { + sb.WriteString("(none)") + } + for i := 0; i < d.mu.compact.readCompactions.size; i++ { + rc := d.mu.compact.readCompactions.at(i) + sb.WriteString(fmt.Sprintf("(level: %d, start: %s, end: %s)\n", rc.level, string(rc.start), string(rc.end))) + } + d.mu.Unlock() + return sb.String() + + case "maybe-compact": + d.mu.Lock() + d.opts.DisableAutomaticCompactions = false + d.maybeScheduleCompaction() + s := compactionString() + d.mu.Unlock() + return s + + case "version": + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestCompactionInuseKeyRanges(t *testing.T) { + cmp := DefaultComparer.Compare + parseMeta := func(s string) *fileMetadata { + parts := strings.Split(s, "-") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + m := (&fileMetadata{}).ExtendRangeKeyBounds( + cmp, + base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + m.InitPhysicalBacking() + return m + } + + opts := (*Options)(nil).EnsureDefaults() + + var c *compaction + datadriven.RunTest(t, "testdata/compaction_inuse_key_ranges", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + c = &compaction{ + cmp: 
DefaultComparer.Compare, + equal: DefaultComparer.Equal, + comparer: DefaultComparer, + formatKey: DefaultComparer.FormatKey, + inputs: []compactionLevel{{}, {}}, + } + c.startLevel, c.outputLevel = &c.inputs[0], &c.inputs[1] + var files [numLevels][]*fileMetadata + var currentLevel int + fileNum := FileNum(1) + + for _, data := range strings.Split(td.Input, "\n") { + switch data { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + level, err := strconv.Atoi(data[1:]) + if err != nil { + return err.Error() + } + currentLevel = level + + default: + meta := parseMeta(data) + meta.FileNum = fileNum + fileNum++ + files[currentLevel] = append(files[currentLevel], meta) + } + } + c.version = newVersion(opts, files) + return c.version.String() + + case "inuse-key-ranges": + var buf bytes.Buffer + for _, line := range strings.Split(td.Input, "\n") { + parts := strings.Fields(line) + if len(parts) != 3 { + fmt.Fprintf(&buf, "expected : %q\n", line) + continue + } + level, err := strconv.Atoi(parts[0]) + if err != nil { + fmt.Fprintf(&buf, "expected : %q: %v\n", line, err) + continue + } + c.outputLevel.level = level + c.smallest.UserKey = []byte(parts[1]) + c.largest.UserKey = []byte(parts[2]) + + c.inuseKeyRanges = nil + c.setupInuseKeyRanges() + if len(c.inuseKeyRanges) == 0 { + fmt.Fprintf(&buf, ".\n") + } else { + for i, r := range c.inuseKeyRanges { + if i > 0 { + fmt.Fprintf(&buf, " ") + } + fmt.Fprintf(&buf, "%s-%s", r.Start, r.End) + } + fmt.Fprintf(&buf, "\n") + } + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestCompactionInuseKeyRangesRandomized(t *testing.T) { + var ( + fileNum = FileNum(0) + opts = (*Options)(nil).EnsureDefaults() + seed = int64(time.Now().UnixNano()) + rng = rand.New(rand.NewSource(seed)) + endKeyspace = 26 * 26 + ) + t.Logf("Using rng seed %d.", seed) + + for iter := 0; iter < 100; iter++ { + makeUserKey := func(i int) []byte { + if i >= endKeyspace { + i = endKeyspace - 1 + } + 
return []byte{byte(i/26 + 'a'), byte(i%26 + 'a')} + } + makeIK := func(level, i int) InternalKey { + return base.MakeInternalKey( + makeUserKey(i), + uint64(numLevels-level), + base.InternalKeyKindSet, + ) + } + makeFile := func(level, start, end int) *fileMetadata { + fileNum++ + m := (&fileMetadata{ + FileNum: fileNum, + }).ExtendPointKeyBounds( + opts.Comparer.Compare, + makeIK(level, start), + makeIK(level, end), + ) + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + m.InitPhysicalBacking() + return m + } + overlaps := func(startA, endA, startB, endB []byte) bool { + disjoint := opts.Comparer.Compare(endB, startA) < 0 || opts.Comparer.Compare(endA, startB) < 0 + return !disjoint + } + var files [numLevels][]*fileMetadata + for l := 0; l < numLevels; l++ { + for i := 0; i < rand.Intn(10); i++ { + s := rng.Intn(endKeyspace) + maxWidth := rng.Intn(endKeyspace-s) + 1 + e := rng.Intn(maxWidth) + s + sKey, eKey := makeUserKey(s), makeUserKey(e) + // Discard the key range if it overlaps any existing files + // within this level. + var o bool + for _, f := range files[l] { + o = o || overlaps(sKey, eKey, f.Smallest.UserKey, f.Largest.UserKey) + } + if o { + continue + } + files[l] = append(files[l], makeFile(l, s, e)) + } + slices.SortFunc(files[l], func(a, b *fileMetadata) int { + return opts.Comparer.Compare(a.Smallest.UserKey, b.Smallest.UserKey) + }) + } + v := newVersion(opts, files) + t.Log(v.DebugString(opts.Comparer.FormatKey)) + for i := 0; i < 1000; i++ { + l := rng.Intn(numLevels) + s := rng.Intn(endKeyspace) + maxWidth := rng.Intn(endKeyspace-s) + 1 + e := rng.Intn(maxWidth) + s + sKey, eKey := makeUserKey(s), makeUserKey(e) + keyRanges := calculateInuseKeyRanges(v, opts.Comparer.Compare, l, numLevels-1, sKey, eKey) + + for level := l; level < numLevels; level++ { + for _, f := range files[level] { + if !overlaps(sKey, eKey, f.Smallest.UserKey, f.Largest.UserKey) { + // This file doesn't overlap the queried range. Skip it. 
+ continue + } + // This file does overlap the queried range. The key range + // [MAX(f.Smallest, sKey), MIN(f.Largest, eKey)] must be fully + // contained by a key range in keyRanges. + checkStart, checkEnd := f.Smallest.UserKey, f.Largest.UserKey + if opts.Comparer.Compare(checkStart, sKey) < 0 { + checkStart = sKey + } + if opts.Comparer.Compare(checkEnd, eKey) > 0 { + checkEnd = eKey + } + var contained bool + for _, kr := range keyRanges { + contained = contained || + (opts.Comparer.Compare(checkStart, kr.Start) >= 0 && + opts.Comparer.Compare(checkEnd, kr.End) <= 0) + } + if !contained { + t.Errorf("Seed %d, iter %d: File %s overlaps %q-%q, but is not fully contained in any of the key ranges.", + seed, iter, f, sKey, eKey) + } + } + } + } + } +} + +func TestCompactionAllowZeroSeqNum(t *testing.T) { + var d *DB + defer func() { + if d != nil { + require.NoError(t, closeAllSnapshots(d)) + require.NoError(t, d.Close()) + } + }() + + metaRE := regexp.MustCompile(`^L([0-9]+):([^-]+)-(.+)$`) + var fileNum base.FileNum + parseMeta := func(s string) (level int, meta *fileMetadata) { + match := metaRE.FindStringSubmatch(s) + if match == nil { + t.Fatalf("malformed table spec: %s", s) + } + level, err := strconv.Atoi(match[1]) + if err != nil { + t.Fatalf("malformed table spec: %s: %s", s, err) + } + fileNum++ + meta = (&fileMetadata{ + FileNum: fileNum, + }).ExtendPointKeyBounds( + d.cmp, + InternalKey{UserKey: []byte(match[2])}, + InternalKey{UserKey: []byte(match[3])}, + ) + meta.InitPhysicalBacking() + return level, meta + } + + datadriven.RunTest(t, "testdata/compaction_allow_zero_seqnum", + func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + if d != nil { + require.NoError(t, closeAllSnapshots(d)) + if err := d.Close(); err != nil { + return err.Error() + } + } + + var err error + if d, err = runDBDefineCmd(td, nil /* options */); err != nil { + return err.Error() + } + + d.mu.Lock() + s := 
d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "allow-zero-seqnum": + d.mu.Lock() + c := &compaction{ + cmp: d.cmp, + comparer: d.opts.Comparer, + version: d.mu.versions.currentVersion(), + inputs: []compactionLevel{{}, {}}, + } + c.startLevel, c.outputLevel = &c.inputs[0], &c.inputs[1] + d.mu.Unlock() + + var buf bytes.Buffer + for _, line := range strings.Split(td.Input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + c.flushing = nil + c.startLevel.level = -1 + + var startFiles, outputFiles []*fileMetadata + + switch { + case len(parts) == 1 && parts[0] == "flush": + c.outputLevel.level = 0 + d.mu.Lock() + c.flushing = d.mu.mem.queue + d.mu.Unlock() + + default: + for _, p := range parts { + level, meta := parseMeta(p) + if c.startLevel.level == -1 { + c.startLevel.level = level + } + + switch level { + case c.startLevel.level: + startFiles = append(startFiles, meta) + case c.startLevel.level + 1: + outputFiles = append(outputFiles, meta) + default: + return fmt.Sprintf("invalid level %d: expected %d or %d", + level, c.startLevel.level, c.startLevel.level+1) + } + } + c.outputLevel.level = c.startLevel.level + 1 + c.startLevel.files = manifest.NewLevelSliceSpecificOrder(startFiles) + c.outputLevel.files = manifest.NewLevelSliceKeySorted(c.cmp, outputFiles) + } + + c.smallest, c.largest = manifest.KeyRange(c.cmp, + c.startLevel.files.Iter(), + c.outputLevel.files.Iter()) + + c.inuseKeyRanges = nil + c.setupInuseKeyRanges() + fmt.Fprintf(&buf, "%t\n", c.allowZeroSeqNum()) + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestCompactionErrorOnUserKeyOverlap(t *testing.T) { + cmp := DefaultComparer.Compare + parseMeta := func(s string) *fileMetadata { + parts := strings.Split(s, "-") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + m := (&fileMetadata{}).ExtendPointKeyBounds( + cmp, + 
base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + m.InitPhysicalBacking() + return m + } + + datadriven.RunTest(t, "testdata/compaction_error_on_user_key_overlap", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "error-on-user-key-overlap": + c := &compaction{ + cmp: DefaultComparer.Compare, + comparer: DefaultComparer, + formatKey: DefaultComparer.FormatKey, + } + var files []manifest.NewFileEntry + fileNum := FileNum(1) + + for _, data := range strings.Split(d.Input, "\n") { + meta := parseMeta(data) + meta.FileNum = fileNum + fileNum++ + files = append(files, manifest.NewFileEntry{Level: 1, Meta: meta}) + } + + result := "OK" + ve := &versionEdit{ + NewFiles: files, + } + if err := c.errorOnUserKeyOverlap(ve); err != nil { + result = fmt.Sprint(err) + } + return result + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +// TestCompactionErrorCleanup tests an error encountered during a compaction +// after some output tables have been created. It ensures that the pending +// output tables are removed from the filesystem. +func TestCompactionErrorCleanup(t *testing.T) { + // protected by d.mu + var ( + initialSetupDone bool + tablesCreated []FileNum + ) + + mem := vfs.NewMem() + ii := errorfs.OnIndex(math.MaxInt32) // start disabled + opts := (&Options{ + FS: errorfs.Wrap(mem, errorfs.ErrInjected.If(ii)), + Levels: make([]LevelOptions, numLevels), + EventListener: &EventListener{ + TableCreated: func(info TableCreateInfo) { + t.Log(info) + + // If the initial setup is over, record tables created and + // inject an error immediately after the second table is + // created. 
+ if initialSetupDone { + tablesCreated = append(tablesCreated, info.FileNum) + if len(tablesCreated) >= 2 { + ii.Store(0) + } + } + }, + }, + }).WithFSDefaults() + for i := range opts.Levels { + opts.Levels[i].TargetFileSize = 1 + } + opts.testingRandomized(t) + d, err := Open("", opts) + require.NoError(t, err) + + ingest := func(keys ...string) { + t.Helper() + f, err := mem.Create("ext") + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + for _, k := range keys { + require.NoError(t, w.Set([]byte(k), nil)) + } + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{"ext"})) + } + ingest("a", "c") + ingest("b") + + // Trigger a manual compaction, which will encounter an injected error + // after the second table is created. + d.mu.Lock() + initialSetupDone = true + d.mu.Unlock() + err = d.Compact([]byte("a"), []byte("d"), false) + require.Error(t, err, "injected error") + + d.mu.Lock() + if len(tablesCreated) < 2 { + t.Fatalf("expected 2 output tables created by compaction: found %d", len(tablesCreated)) + } + d.mu.Unlock() + + require.NoError(t, d.Close()) + for _, fileNum := range tablesCreated { + filename := fmt.Sprintf("%s.sst", fileNum) + if _, err = mem.Stat(filename); err == nil || !oserror.IsNotExist(err) { + t.Errorf("expected %q to not exist: %s", filename, err) + } + } +} + +func TestCompactionCheckOrdering(t *testing.T) { + cmp := DefaultComparer.Compare + parseMeta := func(s string) *fileMetadata { + parts := strings.Split(s, "-") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + m := (&fileMetadata{}).ExtendPointKeyBounds( + cmp, + base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + m.InitPhysicalBacking() + return m + } + + datadriven.RunTest(t, 
"testdata/compaction_check_ordering", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "check-ordering": + c := &compaction{ + cmp: DefaultComparer.Compare, + comparer: DefaultComparer, + formatKey: DefaultComparer.FormatKey, + logger: panicLogger{}, + inputs: []compactionLevel{{level: -1}, {level: -1}}, + } + c.startLevel, c.outputLevel = &c.inputs[0], &c.inputs[1] + var startFiles, outputFiles []*fileMetadata + var sublevels []manifest.LevelSlice + var files *[]*fileMetadata + var sublevel []*fileMetadata + var sublevelNum int + var parsingSublevel bool + fileNum := FileNum(1) + + switchSublevel := func() { + if sublevel != nil { + sublevels = append( + sublevels, manifest.NewLevelSliceSpecificOrder(sublevel), + ) + sublevel = nil + } + parsingSublevel = false + } + + for _, data := range strings.Split(d.Input, "\n") { + if data[0] == 'L' && len(data) == 4 { + // Format L0.{sublevel}. + switchSublevel() + level, err := strconv.Atoi(data[1:2]) + if err != nil { + return err.Error() + } + sublevelNum, err = strconv.Atoi(data[3:]) + if err != nil { + return err.Error() + } + if c.startLevel.level == -1 { + c.startLevel.level = level + files = &startFiles + } + parsingSublevel = true + } else if data[0] == 'L' { + switchSublevel() + level, err := strconv.Atoi(data[1:]) + if err != nil { + return err.Error() + } + if c.startLevel.level == -1 { + c.startLevel.level = level + files = &startFiles + } else if c.outputLevel.level == -1 { + if c.startLevel.level >= level { + return fmt.Sprintf("startLevel=%d >= outputLevel=%d\n", c.startLevel.level, level) + } + c.outputLevel.level = level + files = &outputFiles + } else { + return "outputLevel already set\n" + } + } else { + meta := parseMeta(data) + meta.FileNum = fileNum + fileNum++ + *files = append(*files, meta) + if parsingSublevel { + meta.SubLevel = sublevelNum + sublevel = append(sublevel, meta) + } + } + } + + switchSublevel() + c.startLevel.files = 
manifest.NewLevelSliceSpecificOrder(startFiles) + c.outputLevel.files = manifest.NewLevelSliceSpecificOrder(outputFiles) + if c.outputLevel.level == -1 { + c.outputLevel.level = 0 + } + if c.startLevel.level == 0 { + // We don't change the input files for the compaction beyond this point. + c.startLevel.l0SublevelInfo = generateSublevelInfo(c.cmp, c.startLevel.files) + } + + newIters := func( + _ context.Context, _ *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, + ) (internalIterator, keyspan.FragmentIterator, error) { + return &errorIter{}, nil, nil + } + result := "OK" + _, err := c.newInputIter(newIters, nil, nil) + if err != nil { + result = fmt.Sprint(err) + } + return result + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +type mockSplitter struct { + shouldSplitVal maybeSplit +} + +func (m *mockSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { + return m.shouldSplitVal +} + +func (m *mockSplitter) onNewOutput(key []byte) []byte { + return nil +} + +func TestCompactionOutputSplitters(t *testing.T) { + var main, child0, child1 compactionOutputSplitter + var prevUserKey []byte + pickSplitter := func(input string) *compactionOutputSplitter { + switch input { + case "main": + return &main + case "child0": + return &child0 + case "child1": + return &child1 + default: + t.Fatalf("invalid splitter slot: %s", input) + return nil + } + } + + datadriven.RunTest(t, "testdata/compaction_output_splitters", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "reset": + main = nil + child0 = nil + child1 = nil + case "init": + if len(d.CmdArgs) < 2 { + return "expected at least 2 args" + } + splitterToInit := pickSplitter(d.CmdArgs[0].Key) + switch d.CmdArgs[1].Key { + case "array": + *splitterToInit = &splitterGroup{ + cmp: base.DefaultComparer.Compare, + splitters: []compactionOutputSplitter{child0, child1}, + } + case "mock": + *splitterToInit = &mockSplitter{} + case 
"userkey": + *splitterToInit = &userKeyChangeSplitter{ + cmp: base.DefaultComparer.Compare, + unsafePrevUserKey: func() []byte { + return prevUserKey + }, + splitter: child0, + } + } + (*splitterToInit).onNewOutput(nil) + case "set-should-split": + if len(d.CmdArgs) < 2 { + return "expected at least 2 args" + } + splitterToSet := (*pickSplitter(d.CmdArgs[0].Key)).(*mockSplitter) + var val maybeSplit + switch d.CmdArgs[1].Key { + case "split-now": + val = splitNow + case "no-split": + val = noSplit + default: + t.Fatalf("unexpected value for should-split: %s", d.CmdArgs[1].Key) + } + splitterToSet.shouldSplitVal = val + case "should-split-before": + if len(d.CmdArgs) < 1 { + return "expected at least 1 arg" + } + key := base.ParseInternalKey(d.CmdArgs[0].Key) + shouldSplit := main.shouldSplitBefore(&key, nil) + if shouldSplit == splitNow { + main.onNewOutput(key.UserKey) + prevUserKey = nil + } else { + prevUserKey = key.UserKey + } + return shouldSplit.String() + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + return "ok" + }) +} + +func TestCompactFlushQueuedMemTableAndFlushMetrics(t *testing.T) { + t.Run("", func(t *testing.T) { + // Verify that manual compaction forces a flush of a queued memtable. + + mem := vfs.NewMem() + d, err := Open("", testingRandomized(t, &Options{ + FS: mem, + }).WithFSDefaults()) + require.NoError(t, err) + + // Add the key "a" to the memtable, then fill up the memtable with the key + // prefix "b". The compaction will only overlap with the queued memtable, + // not the mutable memtable. + // NB: The initial memtable size is 256KB, which is filled up with random + // values which typically don't compress well. The test also appends the + // random value to the "b" key to limit overwriting of the same key, which + // would get collapsed at flush time since there are no open snapshots. 
+ value := make([]byte, 50) + _, err = crand.Read(value) + require.NoError(t, err) + require.NoError(t, d.Set([]byte("a"), value, nil)) + for { + _, err = crand.Read(value) + require.NoError(t, err) + require.NoError(t, d.Set(append([]byte("b"), value...), value, nil)) + d.mu.Lock() + done := len(d.mu.mem.queue) == 2 + d.mu.Unlock() + if done { + break + } + } + + require.NoError(t, d.Compact([]byte("a"), []byte("a\x00"), false)) + d.mu.Lock() + require.Equal(t, 1, len(d.mu.mem.queue)) + d.mu.Unlock() + // Flush metrics are updated after and non-atomically with the memtable + // being removed from the queue. + for begin := time.Now(); ; { + metrics := d.Metrics() + require.NotNil(t, metrics) + if metrics.Flush.WriteThroughput.Bytes >= 50*1024 { + // The writes (during which the flush is idle) and the flush work + // should not be so fast as to be unrealistic. If these turn out to be + // flaky we could instead inject a clock. + // + // Windows timer precision is bad (on the order of 1 millisecond) and + // can cause the duration to be 0. + if runtime.GOOS != "windows" { + tinyInterval := 50 * time.Microsecond + require.Less(t, tinyInterval, metrics.Flush.WriteThroughput.WorkDuration) + require.Less(t, tinyInterval, metrics.Flush.WriteThroughput.IdleDuration) + } + break + } + if time.Since(begin) > 2*time.Second { + t.Fatal("flush did not happen") + } + time.Sleep(time.Millisecond) + } + require.NoError(t, d.Close()) + }) +} + +func TestCompactFlushQueuedLargeBatch(t *testing.T) { + // Verify that compaction forces a flush of a queued large batch. + + mem := vfs.NewMem() + d, err := Open("", testingRandomized(t, &Options{ + FS: mem, + }).WithFSDefaults()) + require.NoError(t, err) + + // The default large batch threshold is slightly less than 1/2 of the + // memtable size which makes triggering a problem with flushing queued large + // batches irritating. 
Manually adjust the threshold to 1/8 of the memtable + // size in order to more easily create a situation where a large batch is + // queued but not automatically flushed. + d.mu.Lock() + d.largeBatchThreshold = d.opts.MemTableSize / 8 + require.Equal(t, 1, len(d.mu.mem.queue)) + d.mu.Unlock() + + // Set a record with a large value. This will be transformed into a large + // batch and placed in the flushable queue. + require.NoError(t, d.Set([]byte("a"), bytes.Repeat([]byte("v"), int(d.largeBatchThreshold)), nil)) + d.mu.Lock() + require.Greater(t, len(d.mu.mem.queue), 1) + d.mu.Unlock() + + require.NoError(t, d.Compact([]byte("a"), []byte("a\x00"), false)) + d.mu.Lock() + require.Equal(t, 1, len(d.mu.mem.queue)) + d.mu.Unlock() + + require.NoError(t, d.Close()) +} + +func TestFlushError(t *testing.T) { + // Error the first five times we try to write a sstable. + var errorOps atomic.Int32 + errorOps.Store(3) + fs := errorfs.Wrap(vfs.NewMem(), errorfs.InjectorFunc(func(op errorfs.Op) error { + if op.Kind == errorfs.OpCreate && filepath.Ext(op.Path) == ".sst" && errorOps.Add(-1) >= 0 { + return errorfs.ErrInjected + } + return nil + })) + d, err := Open("", testingRandomized(t, &Options{ + FS: fs, + EventListener: &EventListener{ + BackgroundError: func(err error) { + t.Log(err) + }, + }, + }).WithFSDefaults()) + require.NoError(t, err) + require.NoError(t, d.Set([]byte("a"), []byte("foo"), NoSync)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Close()) +} + +func TestAdjustGrandparentOverlapBytesForFlush(t *testing.T) { + // 500MB in Lbase + var lbaseFiles []*manifest.FileMetadata + const lbaseSize = 5 << 20 + for i := 0; i < 100; i++ { + m := &manifest.FileMetadata{Size: lbaseSize, FileNum: FileNum(i)} + m.InitPhysicalBacking() + lbaseFiles = + append(lbaseFiles, m) + } + const maxOutputFileSize = 2 << 20 + // 20MB max overlap, so flush split into 25 files. 
+ const maxOverlapBytes = 20 << 20 + ls := manifest.NewLevelSliceSpecificOrder(lbaseFiles) + testCases := []struct { + flushingBytes uint64 + adjustedOverlapBytes uint64 + }{ + // Flushes large enough that 25 files is acceptable. + {flushingBytes: 128 << 20, adjustedOverlapBytes: 20971520}, + {flushingBytes: 64 << 20, adjustedOverlapBytes: 20971520}, + // Small increase in adjustedOverlapBytes. + {flushingBytes: 32 << 20, adjustedOverlapBytes: 32768000}, + // Large increase in adjusterOverlapBytes, to limit to 4 files. + {flushingBytes: 1 << 20, adjustedOverlapBytes: 131072000}, + } + for _, tc := range testCases { + t.Run("", func(t *testing.T) { + c := compaction{ + grandparents: ls, + maxOverlapBytes: maxOverlapBytes, + maxOutputFileSize: maxOutputFileSize, + } + adjustGrandparentOverlapBytesForFlush(&c, tc.flushingBytes) + require.Equal(t, tc.adjustedOverlapBytes, c.maxOverlapBytes) + }) + } +} + +func TestCompactionInvalidBounds(t *testing.T) { + db, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + }).WithFSDefaults()) + require.NoError(t, err) + defer db.Close() + require.NoError(t, db.Compact([]byte("a"), []byte("b"), false)) + require.Error(t, db.Compact([]byte("a"), []byte("a"), false)) + require.Error(t, db.Compact([]byte("b"), []byte("a"), false)) +} + +func Test_calculateInuseKeyRanges(t *testing.T) { + opts := (*Options)(nil).EnsureDefaults() + cmp := base.DefaultComparer.Compare + newFileMeta := func(fileNum FileNum, size uint64, smallest, largest base.InternalKey) *fileMetadata { + m := (&fileMetadata{ + FileNum: fileNum, + Size: size, + }).ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest) + m.InitPhysicalBacking() + return m + } + tests := []struct { + name string + v *version + level int + depth int + smallest []byte + largest []byte + want []manifest.UserKeyRange + }{ + { + name: "No files in next level", + v: newVersion(opts, [numLevels][]*fileMetadata{ + 1: { + newFileMeta( + 1, + 1, + 
base.ParseInternalKey("a.SET.2"), + base.ParseInternalKey("c.SET.2"), + ), + newFileMeta( + 2, + 1, + base.ParseInternalKey("d.SET.2"), + base.ParseInternalKey("e.SET.2"), + ), + }, + }), + level: 1, + depth: 2, + smallest: []byte("a"), + largest: []byte("e"), + want: []manifest.UserKeyRange{ + { + Start: []byte("a"), + End: []byte("c"), + }, + { + Start: []byte("d"), + End: []byte("e"), + }, + }, + }, + { + name: "No overlapping key ranges", + v: newVersion(opts, [numLevels][]*fileMetadata{ + 1: { + newFileMeta( + 1, + 1, + base.ParseInternalKey("a.SET.1"), + base.ParseInternalKey("c.SET.1"), + ), + newFileMeta( + 2, + 1, + base.ParseInternalKey("l.SET.1"), + base.ParseInternalKey("p.SET.1"), + ), + }, + 2: { + newFileMeta( + 3, + 1, + base.ParseInternalKey("d.SET.1"), + base.ParseInternalKey("i.SET.1"), + ), + newFileMeta( + 4, + 1, + base.ParseInternalKey("s.SET.1"), + base.ParseInternalKey("w.SET.1"), + ), + }, + }), + level: 1, + depth: 2, + smallest: []byte("a"), + largest: []byte("z"), + want: []manifest.UserKeyRange{ + { + Start: []byte("a"), + End: []byte("c"), + }, + { + Start: []byte("d"), + End: []byte("i"), + }, + { + Start: []byte("l"), + End: []byte("p"), + }, + { + Start: []byte("s"), + End: []byte("w"), + }, + }, + }, + { + name: "First few non-overlapping, followed by overlapping", + v: newVersion(opts, [numLevels][]*fileMetadata{ + 1: { + newFileMeta( + 1, + 1, + base.ParseInternalKey("a.SET.1"), + base.ParseInternalKey("c.SET.1"), + ), + newFileMeta( + 2, + 1, + base.ParseInternalKey("d.SET.1"), + base.ParseInternalKey("e.SET.1"), + ), + newFileMeta( + 3, + 1, + base.ParseInternalKey("n.SET.1"), + base.ParseInternalKey("o.SET.1"), + ), + newFileMeta( + 4, + 1, + base.ParseInternalKey("p.SET.1"), + base.ParseInternalKey("q.SET.1"), + ), + }, + 2: { + newFileMeta( + 5, + 1, + base.ParseInternalKey("m.SET.1"), + base.ParseInternalKey("q.SET.1"), + ), + newFileMeta( + 6, + 1, + base.ParseInternalKey("s.SET.1"), + base.ParseInternalKey("w.SET.1"), + 
), + }, + }), + level: 1, + depth: 2, + smallest: []byte("a"), + largest: []byte("z"), + want: []manifest.UserKeyRange{ + { + Start: []byte("a"), + End: []byte("c"), + }, + { + Start: []byte("d"), + End: []byte("e"), + }, + { + Start: []byte("m"), + End: []byte("q"), + }, + { + Start: []byte("s"), + End: []byte("w"), + }, + }, + }, + { + name: "All overlapping", + v: newVersion(opts, [numLevels][]*fileMetadata{ + 1: { + newFileMeta( + 1, + 1, + base.ParseInternalKey("d.SET.1"), + base.ParseInternalKey("e.SET.1"), + ), + newFileMeta( + 2, + 1, + base.ParseInternalKey("n.SET.1"), + base.ParseInternalKey("o.SET.1"), + ), + newFileMeta( + 3, + 1, + base.ParseInternalKey("p.SET.1"), + base.ParseInternalKey("q.SET.1"), + ), + }, + 2: { + newFileMeta( + 4, + 1, + base.ParseInternalKey("a.SET.1"), + base.ParseInternalKey("c.SET.1"), + ), + newFileMeta( + 5, + 1, + base.ParseInternalKey("d.SET.1"), + base.ParseInternalKey("w.SET.1"), + ), + }, + }), + level: 1, + depth: 2, + smallest: []byte("a"), + largest: []byte("z"), + want: []manifest.UserKeyRange{ + { + Start: []byte("a"), + End: []byte("c"), + }, + { + Start: []byte("d"), + End: []byte("w"), + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := calculateInuseKeyRanges(tt.v, cmp, tt.level, tt.depth, tt.smallest, tt.largest); !reflect.DeepEqual(got, tt.want) { + t.Errorf("calculateInuseKeyRanges() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestMarkedForCompaction(t *testing.T) { + var mem vfs.FS = vfs.NewMem() + var d *DB + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + var buf bytes.Buffer + opts := (&Options{ + FS: mem, + DebugCheck: DebugCheckLevels, + DisableAutomaticCompactions: true, + FormatMajorVersion: internalFormatNewest, + EventListener: &EventListener{ + CompactionEnd: func(info CompactionInfo) { + // Fix the job ID and durations for determinism. 
+ info.JobID = 100 + info.Duration = time.Second + info.TotalDuration = 2 * time.Second + fmt.Fprintln(&buf, info) + }, + }, + }).WithFSDefaults() + + reset := func() { + if d != nil { + require.NoError(t, d.Close()) + } + mem = vfs.NewMem() + require.NoError(t, mem.MkdirAll("ext", 0755)) + + var err error + d, err = Open("", opts) + require.NoError(t, err) + } + datadriven.RunTest(t, "testdata/marked_for_compaction", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + reset() + return "" + + case "define": + if d != nil { + if err := d.Close(); err != nil { + return err.Error() + } + } + var err error + if d, err = runDBDefineCmd(td, opts); err != nil { + return err.Error() + } + d.mu.Lock() + defer d.mu.Unlock() + t := time.Now() + d.timeNow = func() time.Time { + t = t.Add(time.Second) + return t + } + s := d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + return s + + case "mark-for-compaction": + d.mu.Lock() + defer d.mu.Unlock() + vers := d.mu.versions.currentVersion() + var fileNum uint64 + td.ScanArgs(t, "file", &fileNum) + for l, lm := range vers.Levels { + iter := lm.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.FileNum != base.FileNum(fileNum) { + continue + } + f.MarkedForCompaction = true + vers.Stats.MarkedForCompaction++ + vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) + return fmt.Sprintf("marked L%d.%s", l, f.FileNum) + } + } + return "not-found" + + case "maybe-compact": + d.mu.Lock() + defer d.mu.Unlock() + d.opts.DisableAutomaticCompactions = false + d.maybeScheduleCompaction() + for d.mu.compact.compactingCount > 0 { + d.mu.compact.cond.Wait() + } + + fmt.Fprintln(&buf, d.mu.versions.currentVersion().DebugString(base.DefaultFormatter)) + s := strings.TrimSpace(buf.String()) + buf.Reset() + opts.DisableAutomaticCompactions = true + return s + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +// createManifestErrorInjector 
injects errors (when enabled) into vfs.FS calls +// to create MANIFEST files. +type createManifestErrorInjector struct { + enabled atomic.Bool +} + +// TODO(jackson): Replace the createManifestErrorInjector with the composition +// of primitives defined in errorfs. This may require additional primitives. + +func (i *createManifestErrorInjector) String() string { return "MANIFEST-Creates" } + +// enable enables error injection for the vfs.FS. +func (i *createManifestErrorInjector) enable() { + i.enabled.Store(true) +} + +// MaybeError implements errorfs.Injector. +func (i *createManifestErrorInjector) MaybeError(op errorfs.Op) error { + if !i.enabled.Load() { + return nil + } + // This necessitates having a MaxManifestSize of 1, to reliably induce + // logAndApply errors. + if strings.Contains(op.Path, "MANIFEST") && op.Kind == errorfs.OpCreate { + return errorfs.ErrInjected + } + return nil +} + +var _ errorfs.Injector = &createManifestErrorInjector{} + +// TestCompaction_LogAndApplyFails exercises a flush or ingest encountering an +// unrecoverable error during logAndApply. +// +// Regression test for #1669. +func TestCompaction_LogAndApplyFails(t *testing.T) { + // flushKeys writes the given keys to the DB, flushing the resulting memtable. + var key = []byte("foo") + flushErrC := make(chan error) + flushKeys := func(db *DB) error { + b := db.NewBatch() + err := b.Set(key, nil, nil) + require.NoError(t, err) + err = b.Commit(nil) + require.NoError(t, err) + // An error from a failing flush is returned asynchronously. + go func() { _ = db.Flush() }() + return <-flushErrC + } + + // ingestKeys adds the given keys to the DB via an ingestion. + ingestKeys := func(db *DB) error { + // Create an SST for ingestion. 
+ const fName = "ext" + f, err := db.opts.FS.Create(fName) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + require.NoError(t, w.Set(key, nil)) + require.NoError(t, w.Close()) + // Ingest the SST. + return db.Ingest([]string{fName}) + } + + testCases := []struct { + name string + addFn func(db *DB) error + backgroundErrorFn func(*DB, error) + }{ + { + name: "flush", + addFn: flushKeys, + backgroundErrorFn: func(db *DB, err error) { + require.True(t, errors.Is(err, errorfs.ErrInjected)) + flushErrC <- err + // A flush will attempt to retry in the background. For the purposes of + // testing this particular scenario, where we would have crashed anyway, + // drop the memtable on the floor to short circuit the retry loop. + // NB: we hold db.mu here. + var cur *flushableEntry + cur, db.mu.mem.queue = db.mu.mem.queue[0], db.mu.mem.queue[1:] + cur.readerUnrefLocked(true) + }, + }, + { + name: "ingest", + addFn: ingestKeys, + }, + } + + runTest := func(t *testing.T, addFn func(db *DB) error, bgFn func(*DB, error)) { + var db *DB + inj := &createManifestErrorInjector{} + logger := &fatalCapturingLogger{t: t} + opts := (&Options{ + FS: errorfs.Wrap(vfs.NewMem(), inj), + // Rotate the manifest after each write. This is required to trigger a + // file creation, into which errors can be injected. + MaxManifestFileSize: 1, + Logger: logger, + EventListener: &EventListener{ + BackgroundError: func(err error) { + if bgFn != nil { + bgFn(db, err) + } + }, + }, + DisableAutomaticCompactions: true, + }).WithFSDefaults() + + db, err := Open("", opts) + require.NoError(t, err) + defer func() { _ = db.Close() }() + + inj.enable() + err = addFn(db) + require.True(t, errors.Is(err, errorfs.ErrInjected)) + + // Under normal circumstances, such an error in logAndApply would panic and + // cause the DB to terminate here. Assert that we captured the fatal error. 
+ require.True(t, errors.Is(logger.err, errorfs.ErrInjected)) + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + runTest(t, tc.addFn, tc.backgroundErrorFn) + }) + } +} + +// TestSharedObjectDeletePacing tests that we don't throttle shared object +// deletes (see the TargetBytesDeletionRate option). +func TestSharedObjectDeletePacing(t *testing.T) { + var opts Options + opts.FS = vfs.NewMem() + opts.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ + "": remote.NewInMem(), + }) + opts.Experimental.CreateOnShared = remote.CreateOnSharedAll + opts.TargetByteDeletionRate = 1 + + d, err := Open("", &opts) + require.NoError(t, err) + require.NoError(t, d.SetCreatorID(1)) + + randVal := func() []byte { + res := make([]byte, 1024) + _, err := crand.Read(res) + require.NoError(t, err) + return res + } + + // We must set up things so that we will have more live bytes than obsolete + // bytes, otherwise delete pacing will be disabled anyway. + key := func(i int) string { + return fmt.Sprintf("k%02d", i) + } + const numKeys = 20 + for i := 1; i <= numKeys; i++ { + require.NoError(t, d.Set([]byte(key(i)), randVal(), nil)) + require.NoError(t, d.Compact([]byte(key(i)), []byte(key(i)+"1"), false)) + } + + done := make(chan struct{}) + go func() { + err = d.DeleteRange([]byte(key(5)), []byte(key(9)), nil) + if err == nil { + err = d.Compact([]byte(key(5)), []byte(key(9)), false) + } + // Wait for objects to be deleted. + for { + time.Sleep(10 * time.Millisecond) + if len(d.objProvider.List()) < numKeys-2 { + break + } + } + close(done) + }() + + select { + case <-time.After(60 * time.Second): + // Don't close the DB in this case (the goroutine above might panic). 
+ t.Fatalf("compaction timed out, possibly due to incorrect deletion pacing") + case <-done: + } + require.NoError(t, err) + d.Close() +} + +type WriteErrorInjector struct { + enabled atomic.Bool +} + +// TODO(jackson): Replace WriteErrorInjector with use of primitives in errorfs, +// adding new primitives as necessary. + +func (i *WriteErrorInjector) String() string { return "FileWrites(ErrInjected)" } + +// enable enables error injection for the vfs.FS. +func (i *WriteErrorInjector) enable() { + i.enabled.Store(true) +} + +// disable disabled error injection for the vfs.FS. +func (i *WriteErrorInjector) disable() { + i.enabled.Store(false) +} + +// MaybeError implements errorfs.Injector. +func (i *WriteErrorInjector) MaybeError(op errorfs.Op) error { + if !i.enabled.Load() { + return nil + } + // Fail any future write. + if op.Kind == errorfs.OpFileWrite { + return errorfs.ErrInjected + } + return nil +} + +var _ errorfs.Injector = &WriteErrorInjector{} + +// Cumulative compaction stats shouldn't be updated on compaction error. +func TestCompactionErrorStats(t *testing.T) { + // protected by d.mu + var ( + useInjector bool + tablesCreated []FileNum + ) + + mem := vfs.NewMem() + injector := &WriteErrorInjector{} + opts := (&Options{ + FS: errorfs.Wrap(mem, injector), + Levels: make([]LevelOptions, numLevels), + EventListener: &EventListener{ + TableCreated: func(info TableCreateInfo) { + t.Log(info) + + if useInjector { + // We'll write 3 tables during compaction, and we only need + // the writes to error on the third file write, so only enable + // the injector after the first two files have been written to. 
+ tablesCreated = append(tablesCreated, info.FileNum) + if len(tablesCreated) >= 2 { + injector.enable() + } + } + }, + }, + }).WithFSDefaults() + for i := range opts.Levels { + opts.Levels[i].TargetFileSize = 1 + } + opts.testingRandomized(t) + d, err := Open("", opts) + require.NoError(t, err) + + ingest := func(keys ...string) { + t.Helper() + f, err := mem.Create("ext") + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + for _, k := range keys { + require.NoError(t, w.Set([]byte(k), nil)) + } + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{"ext"})) + } + ingest("a", "c") + // Snapshot will preserve the older "a" key during compaction. + snap := d.NewSnapshot() + ingest("a", "b") + + // Trigger a manual compaction, which will encounter an injected error + // after the second table is created. + d.mu.Lock() + useInjector = true + d.mu.Unlock() + + err = d.Compact([]byte("a"), []byte("d"), false) + require.Error(t, err, "injected error") + + // Due to the error, stats shouldn't have been updated. + d.mu.Lock() + require.Equal(t, 0, int(d.mu.snapshots.cumulativePinnedCount)) + require.Equal(t, 0, int(d.mu.snapshots.cumulativePinnedSize)) + useInjector = false + d.mu.Unlock() + + injector.disable() + + // The following compaction won't error, but snapshot is open, so snapshot + // pinned stats should update. + require.NoError(t, d.Compact([]byte("a"), []byte("d"), false)) + require.NoError(t, snap.Close()) + + d.mu.Lock() + require.Equal(t, 1, int(d.mu.snapshots.cumulativePinnedCount)) + require.Equal(t, 9, int(d.mu.snapshots.cumulativePinnedSize)) + d.mu.Unlock() + require.NoError(t, d.Close()) +} diff --git a/pebble/comparer.go b/pebble/comparer.go new file mode 100644 index 0000000..c92cd79 --- /dev/null +++ b/pebble/comparer.go @@ -0,0 +1,31 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. 
All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import "github.com/cockroachdb/pebble/internal/base" + +// Compare exports the base.Compare type. +type Compare = base.Compare + +// Equal exports the base.Equal type. +type Equal = base.Equal + +// AbbreviatedKey exports the base.AbbreviatedKey type. +type AbbreviatedKey = base.AbbreviatedKey + +// Separator exports the base.Separator type. +type Separator = base.Separator + +// Successor exports the base.Successor type. +type Successor = base.Successor + +// Split exports the base.Split type. +type Split = base.Split + +// Comparer exports the base.Comparer type. +type Comparer = base.Comparer + +// DefaultComparer exports the base.DefaultComparer variable. +var DefaultComparer = base.DefaultComparer diff --git a/pebble/data_test.go b/pebble/data_test.go new file mode 100644 index 0000000..9f6260f --- /dev/null +++ b/pebble/data_test.go @@ -0,0 +1,1426 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "bytes" + crand "crypto/rand" + "fmt" + "io" + "math" + "math/rand" + "strconv" + "strings" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/bloom" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/humanize" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/private" + "github.com/cockroachdb/pebble/internal/rangedel" + "github.com/cockroachdb/pebble/internal/rangekey" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/vfs/errorfs" + "github.com/stretchr/testify/require" +) + +func runGetCmd(t testing.TB, td *datadriven.TestData, d *DB) string { + snap := Snapshot{ + db: d, + seqNum: InternalKeySeqNumMax, + } + td.MaybeScanArgs(t, "seq", &snap.seqNum) + + var buf bytes.Buffer + for _, data := range strings.Split(td.Input, "\n") { + v, closer, err := snap.Get([]byte(data)) + if err != nil { + fmt.Fprintf(&buf, "%s: %s\n", data, err) + } else { + fmt.Fprintf(&buf, "%s:%s\n", data, v) + closer.Close() + } + } + return buf.String() +} + +func runIterCmd(d *datadriven.TestData, iter *Iterator, closeIter bool) string { + if closeIter { + defer func() { + if iter != nil { + iter.Close() + } + }() + } + var b bytes.Buffer + for _, line := range strings.Split(d.Input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + printValidityState := false + var valid bool + var validityState IterValidityState + switch parts[0] { + case "seek-ge": + if len(parts) != 2 { + return "seek-ge \n" + } + valid = iter.SeekGE([]byte(parts[1])) + case "seek-prefix-ge": + if len(parts) != 2 { + return "seek-prefix-ge \n" + } + valid = 
iter.SeekPrefixGE([]byte(parts[1])) + case "seek-lt": + if len(parts) != 2 { + return "seek-lt \n" + } + valid = iter.SeekLT([]byte(parts[1])) + case "seek-ge-limit": + if len(parts) != 3 { + return "seek-ge-limit \n" + } + validityState = iter.SeekGEWithLimit( + []byte(parts[1]), []byte(parts[2])) + printValidityState = true + case "seek-lt-limit": + if len(parts) != 3 { + return "seek-lt-limit \n" + } + validityState = iter.SeekLTWithLimit( + []byte(parts[1]), []byte(parts[2])) + printValidityState = true + case "inspect": + if len(parts) != 2 { + return "inspect \n" + } + field := parts[1] + switch field { + case "lastPositioningOp": + op := "?" + switch iter.lastPositioningOp { + case unknownLastPositionOp: + op = "unknown" + case seekPrefixGELastPositioningOp: + op = "seekprefixge" + case seekGELastPositioningOp: + op = "seekge" + case seekLTLastPositioningOp: + op = "seeklt" + case invalidatedLastPositionOp: + op = "invalidate" + } + fmt.Fprintf(&b, "%s=%q\n", field, op) + default: + return fmt.Sprintf("unrecognized inspect field %q\n", field) + } + continue + case "next-limit": + if len(parts) != 2 { + return "next-limit \n" + } + validityState = iter.NextWithLimit([]byte(parts[1])) + printValidityState = true + case "internal-next": + validity, keyKind := iter.internalNext() + switch validity { + case internalNextError: + fmt.Fprintf(&b, "err: %s\n", iter.Error()) + case internalNextExhausted: + fmt.Fprint(&b, ".\n") + case internalNextValid: + fmt.Fprintf(&b, "%s\n", keyKind) + default: + panic("unreachable") + } + continue + case "can-deterministically-single-delete": + ok, err := CanDeterministicallySingleDelete(iter) + if err != nil { + fmt.Fprintf(&b, "err: %s\n", err) + } else { + fmt.Fprintf(&b, "%t\n", ok) + } + continue + case "prev-limit": + if len(parts) != 2 { + return "prev-limit \n" + } + validityState = iter.PrevWithLimit([]byte(parts[1])) + printValidityState = true + case "first": + valid = iter.First() + case "last": + valid = iter.Last() 
+ case "next": + valid = iter.Next() + case "next-prefix": + valid = iter.NextPrefix() + case "prev": + valid = iter.Prev() + case "set-bounds": + if len(parts) <= 1 || len(parts) > 3 { + return "set-bounds lower= upper=\n" + } + var lower []byte + var upper []byte + for _, part := range parts[1:] { + arg := strings.Split(part, "=") + switch arg[0] { + case "lower": + lower = []byte(arg[1]) + case "upper": + upper = []byte(arg[1]) + default: + return fmt.Sprintf("set-bounds: unknown arg: %s", arg) + } + } + iter.SetBounds(lower, upper) + valid = iter.Valid() + case "set-options": + opts := iter.opts + if _, err := parseIterOptions(&opts, &iter.opts, parts[1:]); err != nil { + return fmt.Sprintf("set-options: %s", err.Error()) + } + iter.SetOptions(&opts) + valid = iter.Valid() + case "stats": + stats := iter.Stats() + // The timing is non-deterministic, so set to 0. + stats.InternalStats.BlockReadDuration = 0 + fmt.Fprintf(&b, "stats: %s\n", stats.String()) + continue + case "clone": + var opts CloneOptions + if len(parts) > 1 { + var iterOpts IterOptions + if foundAny, err := parseIterOptions(&iterOpts, &iter.opts, parts[1:]); err != nil { + return fmt.Sprintf("clone: %s", err.Error()) + } else if foundAny { + opts.IterOptions = &iterOpts + } + for _, part := range parts[1:] { + if arg := strings.Split(part, "="); len(arg) == 2 && arg[0] == "refresh-batch" { + var err error + opts.RefreshBatchView, err = strconv.ParseBool(arg[1]) + if err != nil { + return fmt.Sprintf("clone: refresh-batch: %s", err.Error()) + } + } + } + } + clonedIter, err := iter.Clone(opts) + if err != nil { + fmt.Fprintf(&b, "error in clone, skipping rest of input: err=%v\n", err) + return b.String() + } + if err = iter.Close(); err != nil { + fmt.Fprintf(&b, "err=%v\n", err) + } + iter = clonedIter + case "is-using-combined": + if iter.opts.KeyTypes != IterKeyTypePointsAndRanges { + fmt.Fprintln(&b, "not configured for combined iteration") + } else if 
iter.lazyCombinedIter.combinedIterState.initialized { + fmt.Fprintln(&b, "using combined (non-lazy) iterator") + } else { + fmt.Fprintln(&b, "using lazy iterator") + } + continue + default: + return fmt.Sprintf("unknown op: %s", parts[0]) + } + + valid = valid || validityState == IterValid + if valid != iter.Valid() { + fmt.Fprintf(&b, "mismatched valid states: %t vs %t\n", valid, iter.Valid()) + } + hasPoint, hasRange := iter.HasPointAndRange() + hasEither := hasPoint || hasRange + if hasEither != valid { + fmt.Fprintf(&b, "mismatched valid/HasPointAndRange states: valid=%t HasPointAndRange=(%t,%t)\n", valid, hasPoint, hasRange) + } + + if valid { + validityState = IterValid + } + printIterState(&b, iter, validityState, printValidityState) + } + return b.String() +} + +func parseIterOptions( + opts *IterOptions, ref *IterOptions, parts []string, +) (foundAny bool, err error) { + const usageString = "[lower=] [upper=] [key-types=point|range|both] [mask-suffix=] [mask-filter=] [only-durable=] [table-filter=reuse|none] [point-filters=reuse|none]\n" + for _, part := range parts { + arg := strings.SplitN(part, "=", 2) + if len(arg) != 2 { + return false, errors.Newf(usageString) + } + switch arg[0] { + case "point-filters": + switch arg[1] { + case "reuse": + opts.PointKeyFilters = ref.PointKeyFilters + case "none": + opts.PointKeyFilters = nil + default: + return false, errors.Newf("unknown arg point-filter=%q:\n%s", arg[1], usageString) + } + case "lower": + opts.LowerBound = []byte(arg[1]) + case "upper": + opts.UpperBound = []byte(arg[1]) + case "key-types": + switch arg[1] { + case "point": + opts.KeyTypes = IterKeyTypePointsOnly + case "range": + opts.KeyTypes = IterKeyTypeRangesOnly + case "both": + opts.KeyTypes = IterKeyTypePointsAndRanges + default: + return false, errors.Newf("unknown key-type %q:\n%s", arg[1], usageString) + } + case "mask-suffix": + opts.RangeKeyMasking.Suffix = []byte(arg[1]) + case "mask-filter": + opts.RangeKeyMasking.Filter = func() 
BlockPropertyFilterMask { + return sstable.NewTestKeysMaskingFilter() + } + case "table-filter": + switch arg[1] { + case "reuse": + opts.TableFilter = ref.TableFilter + case "none": + opts.TableFilter = nil + default: + return false, errors.Newf("unknown arg table-filter=%q:\n%s", arg[1], usageString) + } + case "only-durable": + var err error + opts.OnlyReadGuaranteedDurable, err = strconv.ParseBool(arg[1]) + if err != nil { + return false, errors.Newf("cannot parse only-durable=%q: %s", arg[1], err) + } + default: + continue + } + foundAny = true + } + return foundAny, nil +} + +func printIterState( + b io.Writer, iter *Iterator, validity IterValidityState, printValidityState bool, +) { + var validityStateStr string + if printValidityState { + switch validity { + case IterExhausted: + validityStateStr = " exhausted" + case IterValid: + validityStateStr = " valid" + case IterAtLimit: + validityStateStr = " at-limit" + } + } + if err := iter.Error(); err != nil { + fmt.Fprintf(b, "err=%v\n", err) + } else if validity == IterValid { + switch { + case iter.opts.pointKeys(): + hasPoint, hasRange := iter.HasPointAndRange() + fmt.Fprintf(b, "%s:%s (", iter.Key(), validityStateStr) + if hasPoint { + fmt.Fprintf(b, "%s, ", iter.Value()) + } else { + fmt.Fprint(b, "., ") + } + if hasRange { + start, end := iter.RangeBounds() + fmt.Fprintf(b, "[%s-%s)", formatASCIIKey(start), formatASCIIKey(end)) + writeRangeKeys(b, iter) + } else { + fmt.Fprint(b, ".") + } + if iter.RangeKeyChanged() { + fmt.Fprint(b, " UPDATED") + } + fmt.Fprint(b, ")") + default: + if iter.Valid() { + hasPoint, hasRange := iter.HasPointAndRange() + if hasPoint || !hasRange { + panic(fmt.Sprintf("pebble: unexpected HasPointAndRange (%t, %t)", hasPoint, hasRange)) + } + start, end := iter.RangeBounds() + fmt.Fprintf(b, "%s [%s-%s)", iter.Key(), formatASCIIKey(start), formatASCIIKey(end)) + writeRangeKeys(b, iter) + } else { + fmt.Fprint(b, ".") + } + if iter.RangeKeyChanged() { + fmt.Fprint(b, " UPDATED") 
+ } + } + fmt.Fprintln(b) + } else { + fmt.Fprintf(b, ".%s\n", validityStateStr) + } +} + +func formatASCIIKey(b []byte) string { + if bytes.IndexFunc(b, func(r rune) bool { return r < 'A' || r > 'z' }) != -1 { + // This key is not just ASCII letters. Quote it. + return fmt.Sprintf("%q", b) + } + return string(b) +} + +func writeRangeKeys(b io.Writer, iter *Iterator) { + rangeKeys := iter.RangeKeys() + for j := 0; j < len(rangeKeys); j++ { + if j > 0 { + fmt.Fprint(b, ",") + } + fmt.Fprintf(b, " %s=%s", rangeKeys[j].Suffix, rangeKeys[j].Value) + } +} + +func runBatchDefineCmd(d *datadriven.TestData, b *Batch) error { + for _, line := range strings.Split(d.Input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + if parts[1] == `` { + parts[1] = "" + } + var err error + switch parts[0] { + case "set": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + err = b.Set([]byte(parts[1]), []byte(parts[2]), nil) + case "del": + if len(parts) != 2 { + return errors.Errorf("%s expects 1 argument", parts[0]) + } + err = b.Delete([]byte(parts[1]), nil) + case "del-sized": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + var valSize uint64 + valSize, err = strconv.ParseUint(parts[2], 10, 32) + if err != nil { + return err + } + err = b.DeleteSized([]byte(parts[1]), uint32(valSize), nil) + case "singledel": + if len(parts) != 2 { + return errors.Errorf("%s expects 1 argument", parts[0]) + } + err = b.SingleDelete([]byte(parts[1]), nil) + case "del-range": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + err = b.DeleteRange([]byte(parts[1]), []byte(parts[2]), nil) + case "merge": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + err = b.Merge([]byte(parts[1]), []byte(parts[2]), nil) + case "range-key-set": + if len(parts) < 4 || len(parts) > 5 { + return errors.Errorf("%s expects 3 or 4 arguments", 
parts[0]) + } + var val []byte + if len(parts) == 5 { + val = []byte(parts[4]) + } + err = b.RangeKeySet( + []byte(parts[1]), + []byte(parts[2]), + []byte(parts[3]), + val, + nil) + case "range-key-unset": + if len(parts) != 4 { + return errors.Errorf("%s expects 3 arguments", parts[0]) + } + err = b.RangeKeyUnset( + []byte(parts[1]), + []byte(parts[2]), + []byte(parts[3]), + nil) + case "range-key-del": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + err = b.RangeKeyDelete( + []byte(parts[1]), + []byte(parts[2]), + nil) + default: + return errors.Errorf("unknown op: %s", parts[0]) + } + if err != nil { + return err + } + } + return nil +} + +func runBuildRemoteCmd(td *datadriven.TestData, d *DB, storage remote.Storage) error { + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err + } + + if len(td.CmdArgs) < 1 { + return errors.New("build : argument missing") + } + path := td.CmdArgs[0].String() + + // Override table format, if provided. 
+ tableFormat := d.opts.FormatMajorVersion.MaxTableFormat() + for _, cmdArg := range td.CmdArgs[1:] { + switch cmdArg.Key { + case "format": + switch cmdArg.Vals[0] { + case "leveldb": + tableFormat = sstable.TableFormatLevelDB + case "rocksdbv2": + tableFormat = sstable.TableFormatRocksDBv2 + case "pebblev1": + tableFormat = sstable.TableFormatPebblev1 + case "pebblev2": + tableFormat = sstable.TableFormatPebblev2 + case "pebblev3": + tableFormat = sstable.TableFormatPebblev3 + case "pebblev4": + tableFormat = sstable.TableFormatPebblev4 + default: + return errors.Errorf("unknown format string %s", cmdArg.Vals[0]) + } + } + } + + writeOpts := d.opts.MakeWriterOptions(0 /* level */, tableFormat) + + f, err := storage.CreateObject(path) + if err != nil { + return err + } + w := sstable.NewWriter(objstorageprovider.NewRemoteWritable(f), writeOpts) + iter := b.newInternalIter(nil) + for key, val := iter.First(); key != nil; key, val = iter.Next() { + tmp := *key + tmp.SetSeqNum(0) + if err := w.Add(tmp, val.InPlaceValue()); err != nil { + return err + } + } + if err := iter.Close(); err != nil { + return err + } + + if rdi := b.newRangeDelIter(nil, math.MaxUint64); rdi != nil { + for s := rdi.First(); s != nil; s = rdi.Next() { + err := rangedel.Encode(s, func(k base.InternalKey, v []byte) error { + k.SetSeqNum(0) + return w.Add(k, v) + }) + if err != nil { + return err + } + } + } + + if rki := b.newRangeKeyIter(nil, math.MaxUint64); rki != nil { + for s := rki.First(); s != nil; s = rki.Next() { + for _, k := range s.Keys { + var err error + switch k.Kind() { + case base.InternalKeyKindRangeKeySet: + err = w.RangeKeySet(s.Start, s.End, k.Suffix, k.Value) + case base.InternalKeyKindRangeKeyUnset: + err = w.RangeKeyUnset(s.Start, s.End, k.Suffix) + case base.InternalKeyKindRangeKeyDelete: + err = w.RangeKeyDelete(s.Start, s.End) + default: + panic("not a range key") + } + if err != nil { + return err + } + } + } + } + + return w.Close() +} + +func runBuildCmd(td 
*datadriven.TestData, d *DB, fs vfs.FS) error { + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err + } + + if len(td.CmdArgs) < 1 { + return errors.New("build : argument missing") + } + path := td.CmdArgs[0].String() + + // Override table format, if provided. + tableFormat := d.opts.FormatMajorVersion.MaxTableFormat() + for _, cmdArg := range td.CmdArgs[1:] { + switch cmdArg.Key { + case "format": + switch cmdArg.Vals[0] { + case "leveldb": + tableFormat = sstable.TableFormatLevelDB + case "rocksdbv2": + tableFormat = sstable.TableFormatRocksDBv2 + case "pebblev1": + tableFormat = sstable.TableFormatPebblev1 + case "pebblev2": + tableFormat = sstable.TableFormatPebblev2 + case "pebblev3": + tableFormat = sstable.TableFormatPebblev3 + case "pebblev4": + tableFormat = sstable.TableFormatPebblev4 + default: + return errors.Errorf("unknown format string %s", cmdArg.Vals[0]) + } + } + } + + writeOpts := d.opts.MakeWriterOptions(0 /* level */, tableFormat) + + f, err := fs.Create(path) + if err != nil { + return err + } + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts) + iter := b.newInternalIter(nil) + for key, val := iter.First(); key != nil; key, val = iter.Next() { + tmp := *key + tmp.SetSeqNum(0) + if err := w.Add(tmp, val.InPlaceValue()); err != nil { + return err + } + } + if err := iter.Close(); err != nil { + return err + } + + if rdi := b.newRangeDelIter(nil, math.MaxUint64); rdi != nil { + for s := rdi.First(); s != nil; s = rdi.Next() { + err := rangedel.Encode(s, func(k base.InternalKey, v []byte) error { + k.SetSeqNum(0) + return w.Add(k, v) + }) + if err != nil { + return err + } + } + } + + if rki := b.newRangeKeyIter(nil, math.MaxUint64); rki != nil { + for s := rki.First(); s != nil; s = rki.Next() { + for _, k := range s.Keys { + var err error + switch k.Kind() { + case base.InternalKeyKindRangeKeySet: + err = w.RangeKeySet(s.Start, s.End, k.Suffix, k.Value) + case 
base.InternalKeyKindRangeKeyUnset: + err = w.RangeKeyUnset(s.Start, s.End, k.Suffix) + case base.InternalKeyKindRangeKeyDelete: + err = w.RangeKeyDelete(s.Start, s.End) + default: + panic("not a range key") + } + if err != nil { + return err + } + } + } + } + + return w.Close() +} + +func runCompactCmd(td *datadriven.TestData, d *DB) error { + if len(td.CmdArgs) > 4 { + return errors.Errorf("%s expects at most four arguments", td.Cmd) + } + parts := strings.Split(td.CmdArgs[0].Key, "-") + if len(parts) != 2 { + return errors.Errorf("expected -: %s", td.Input) + } + parallelize := td.HasArg("parallel") + if len(td.CmdArgs) >= 2 && strings.HasPrefix(td.CmdArgs[1].Key, "L") { + levelString := td.CmdArgs[1].String() + iStart := base.MakeInternalKey([]byte(parts[0]), InternalKeySeqNumMax, InternalKeyKindMax) + iEnd := base.MakeInternalKey([]byte(parts[1]), 0, 0) + if levelString[0] != 'L' { + return errors.Errorf("expected L: %s", levelString) + } + level, err := strconv.Atoi(levelString[1:]) + if err != nil { + return err + } + return d.manualCompact(iStart.UserKey, iEnd.UserKey, level, parallelize) + } + return d.Compact([]byte(parts[0]), []byte(parts[1]), parallelize) +} + +// runDBDefineCmd prepares a database state, returning the opened +// database with the initialized state. +// +// The command accepts input describing memtables and sstables to +// construct. Each new table is indicated by a line containing the +// level of the next table to build (eg, "L6"), or "mem" to build +// a memtable. Each subsequent line contains a new key-value pair. +// +// Point keys and range deletions should be encoded as the +// InternalKey's string representation, as understood by +// ParseInternalKey, followed a colon and the corresponding value. +// +// b.SET.50:foo +// c.DEL.20 +// +// Range keys may be encoded by prefixing the line with `rangekey:`, +// followed by the keyspan.Span string representation, as understood +// by keyspan.ParseSpan. 
+// +// rangekey:b-d:{(#5,RANGEKEYSET,@2,foo)} +// +// # Mechanics +// +// runDBDefineCmd works by simulating a flush for every file written. +// Keys are written to a memtable. When a file is complete, the table +// is flushed to physical files through manually invoking runCompaction. +// The resulting version edit is then manipulated to write the files +// to the indicated level. +// +// Because of it's low-level manipulation, runDBDefineCmd does allow the +// creation of invalid database states. If opts.DebugCheck is set, the +// level checker should detect the invalid state. +func runDBDefineCmd(td *datadriven.TestData, opts *Options) (*DB, error) { + opts = opts.EnsureDefaults() + opts.FS = vfs.NewMem() + + var snapshots []uint64 + var levelMaxBytes map[int]int64 + for _, arg := range td.CmdArgs { + switch arg.Key { + case "target-file-sizes": + opts.Levels = make([]LevelOptions, len(arg.Vals)) + for i := range arg.Vals { + size, err := strconv.ParseInt(arg.Vals[i], 10, 64) + if err != nil { + return nil, err + } + opts.Levels[i].TargetFileSize = size + } + case "snapshots": + snapshots = make([]uint64, len(arg.Vals)) + for i := range arg.Vals { + seqNum, err := strconv.ParseUint(arg.Vals[i], 10, 64) + if err != nil { + return nil, err + } + snapshots[i] = seqNum + if i > 0 && snapshots[i] < snapshots[i-1] { + return nil, errors.New("Snapshots must be in ascending order") + } + } + case "lbase-max-bytes": + lbaseMaxBytes, err := strconv.ParseInt(arg.Vals[0], 10, 64) + if err != nil { + return nil, err + } + opts.LBaseMaxBytes = lbaseMaxBytes + case "level-max-bytes": + levelMaxBytes = map[int]int64{} + for i := range arg.Vals { + j := strings.Index(arg.Vals[i], ":") + levelStr := strings.TrimSpace(arg.Vals[i][:j]) + level, err := strconv.Atoi(levelStr[1:]) + if err != nil { + return nil, err + } + size, err := strconv.ParseInt(strings.TrimSpace(arg.Vals[i][j+1:]), 10, 64) + if err != nil { + return nil, err + } + levelMaxBytes[level] = size + } + case 
"auto-compactions": + switch arg.Vals[0] { + case "off": + opts.DisableAutomaticCompactions = true + case "on": + opts.DisableAutomaticCompactions = false + default: + return nil, errors.Errorf("Unrecognized %q %q arg value: %q", td.Cmd, arg.Key, arg.Vals[0]) + } + case "enable-table-stats": + enable, err := strconv.ParseBool(arg.Vals[0]) + if err != nil { + return nil, errors.Errorf("%s: could not parse %q as bool: %s", td.Cmd, arg.Vals[0], err) + } + opts.private.disableTableStats = !enable + case "block-size": + size, err := strconv.Atoi(arg.Vals[0]) + if err != nil { + return nil, err + } + for _, levelOpts := range opts.Levels { + levelOpts.BlockSize = size + } + case "format-major-version": + fmv, err := strconv.Atoi(arg.Vals[0]) + if err != nil { + return nil, err + } + opts.FormatMajorVersion = FormatMajorVersion(fmv) + case "disable-multi-level": + opts.Experimental.MultiLevelCompactionHeuristic = NoMultiLevel{} + } + } + + // This is placed after the argument parsing above, because the arguments + // to define should be parsed even if td.Input is empty. + if td.Input == "" { + // Empty LSM. 
+ d, err := Open("", opts) + if err != nil { + return nil, err + } + d.mu.Lock() + for i := range snapshots { + s := &Snapshot{db: d} + s.seqNum = snapshots[i] + d.mu.snapshots.pushBack(s) + } + for l, maxBytes := range levelMaxBytes { + d.mu.versions.picker.(*compactionPickerByScore).levelMaxBytes[l] = maxBytes + } + d.mu.Unlock() + return d, nil + } + + d, err := Open("", opts) + if err != nil { + return nil, err + } + d.mu.Lock() + d.mu.versions.dynamicBaseLevel = false + for i := range snapshots { + s := &Snapshot{db: d} + s.seqNum = snapshots[i] + d.mu.snapshots.pushBack(s) + } + defer d.mu.Unlock() + + var mem *memTable + var start, end *base.InternalKey + ve := &versionEdit{} + level := -1 + + maybeFlush := func() error { + if level < 0 { + return nil + } + + toFlush := flushableList{{ + flushable: mem, + flushed: make(chan struct{}), + }} + c := newFlush(d.opts, d.mu.versions.currentVersion(), + d.mu.versions.picker.getBaseLevel(), toFlush, time.Now()) + c.disableSpanElision = true + // NB: define allows the test to exactly specify which keys go + // into which sstables. If the test has a small target file + // size to test grandparent limits, etc, the maxOutputFileSize + // can cause splitting /within/ the bounds specified to the + // test. Ignore the target size here, and split only according + // to the user-defined boundaries. 
+ c.maxOutputFileSize = math.MaxUint64 + + newVE, _, _, err := d.runCompaction(0, c) + if err != nil { + return err + } + largestSeqNum := d.mu.versions.logSeqNum.Load() + for _, f := range newVE.NewFiles { + if start != nil { + f.Meta.SmallestPointKey = *start + f.Meta.Smallest = *start + } + if end != nil { + f.Meta.LargestPointKey = *end + f.Meta.Largest = *end + } + if largestSeqNum <= f.Meta.LargestSeqNum { + largestSeqNum = f.Meta.LargestSeqNum + 1 + } + ve.NewFiles = append(ve.NewFiles, newFileEntry{ + Level: level, + Meta: f.Meta, + }) + } + // The committed keys were never written to the WAL, so neither + // the logSeqNum nor the commit pipeline's visibleSeqNum have + // been ratcheted. Manually ratchet them to the largest sequence + // number committed to ensure iterators opened from the database + // correctly observe the committed keys. + if d.mu.versions.logSeqNum.Load() < largestSeqNum { + d.mu.versions.logSeqNum.Store(largestSeqNum) + } + if d.mu.versions.visibleSeqNum.Load() < largestSeqNum { + d.mu.versions.visibleSeqNum.Store(largestSeqNum) + } + level = -1 + return nil + } + + // Example, a-c. + parseMeta := func(s string) (*fileMetadata, error) { + parts := strings.Split(s, "-") + if len(parts) != 2 { + return nil, errors.Errorf("malformed table spec: %s", s) + } + m := (&fileMetadata{}).ExtendPointKeyBounds( + opts.Comparer.Compare, + InternalKey{UserKey: []byte(parts[0])}, + InternalKey{UserKey: []byte(parts[1])}, + ) + m.InitPhysicalBacking() + return m, nil + } + + // Example, compact: a-c. 
+ parseCompaction := func(outputLevel int, s string) (*compaction, error) { + m, err := parseMeta(s[len("compact:"):]) + if err != nil { + return nil, err + } + c := &compaction{ + inputs: []compactionLevel{{}, {level: outputLevel}}, + smallest: m.Smallest, + largest: m.Largest, + } + c.startLevel, c.outputLevel = &c.inputs[0], &c.inputs[1] + return c, nil + } + + for _, line := range strings.Split(td.Input, "\n") { + fields := strings.Fields(line) + if len(fields) > 0 { + switch fields[0] { + case "mem": + if err := maybeFlush(); err != nil { + return nil, err + } + // Add a memtable layer. + if !d.mu.mem.mutable.empty() { + d.mu.mem.mutable = newMemTable(memTableOptions{Options: d.opts}) + entry := d.newFlushableEntry(d.mu.mem.mutable, 0, 0) + entry.readerRefs.Add(1) + d.mu.mem.queue = append(d.mu.mem.queue, entry) + d.updateReadStateLocked(nil) + } + mem = d.mu.mem.mutable + start, end = nil, nil + fields = fields[1:] + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + if err := maybeFlush(); err != nil { + return nil, err + } + var err error + if level, err = strconv.Atoi(fields[0][1:]); err != nil { + return nil, err + } + fields = fields[1:] + start, end = nil, nil + boundFields := 0 + for _, field := range fields { + toBreak := false + switch { + case strings.HasPrefix(field, "start="): + ikey := base.ParseInternalKey(strings.TrimPrefix(field, "start=")) + start = &ikey + boundFields++ + case strings.HasPrefix(field, "end="): + ikey := base.ParseInternalKey(strings.TrimPrefix(field, "end=")) + end = &ikey + boundFields++ + default: + toBreak = true + } + if toBreak { + break + } + } + fields = fields[boundFields:] + mem = newMemTable(memTableOptions{Options: d.opts}) + } + } + + for _, data := range fields { + i := strings.Index(data, ":") + // Define in-progress compactions. 
+ if data[:i] == "compact" { + c, err := parseCompaction(level, data) + if err != nil { + return nil, err + } + d.mu.compact.inProgress[c] = struct{}{} + continue + } + if data[:i] == "rangekey" { + span := keyspan.ParseSpan(data[i:]) + err := rangekey.Encode(&span, func(k base.InternalKey, v []byte) error { + return mem.set(k, v) + }) + if err != nil { + return nil, err + } + continue + } + key := base.ParseInternalKey(data[:i]) + valueStr := data[i+1:] + value := []byte(valueStr) + var randBytes int + if n, err := fmt.Sscanf(valueStr, "", &randBytes); err == nil && n == 1 { + value = make([]byte, randBytes) + rnd := rand.New(rand.NewSource(int64(key.SeqNum()))) + if _, err := rnd.Read(value[:]); err != nil { + return nil, err + } + } + if err := mem.set(key, value); err != nil { + return nil, err + } + } + } + + if err := maybeFlush(); err != nil { + return nil, err + } + + if len(ve.NewFiles) > 0 { + jobID := d.mu.nextJobID + d.mu.nextJobID++ + d.mu.versions.logLock() + if err := d.mu.versions.logAndApply(jobID, ve, newFileMetrics(ve.NewFiles), false, func() []compactionInfo { + return nil + }); err != nil { + return nil, err + } + d.updateReadStateLocked(nil) + d.updateTableStatsLocked(ve.NewFiles) + } + + for l, maxBytes := range levelMaxBytes { + d.mu.versions.picker.(*compactionPickerByScore).levelMaxBytes[l] = maxBytes + } + + return d, nil +} + +func runTableStatsCmd(td *datadriven.TestData, d *DB) string { + u, err := strconv.ParseUint(strings.TrimSpace(td.Input), 10, 64) + if err != nil { + return err.Error() + } + fileNum := base.FileNum(u) + + d.mu.Lock() + defer d.mu.Unlock() + v := d.mu.versions.currentVersion() + for _, levelMetadata := range v.Levels { + iter := levelMetadata.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.FileNum != fileNum { + continue + } + + if !f.StatsValid() { + d.waitTableStats() + } + + var b bytes.Buffer + fmt.Fprintf(&b, "num-entries: %d\n", f.Stats.NumEntries) + fmt.Fprintf(&b, "num-deletions: %d\n", 
f.Stats.NumDeletions) + fmt.Fprintf(&b, "num-range-key-sets: %d\n", f.Stats.NumRangeKeySets) + fmt.Fprintf(&b, "point-deletions-bytes-estimate: %d\n", f.Stats.PointDeletionsBytesEstimate) + fmt.Fprintf(&b, "range-deletions-bytes-estimate: %d\n", f.Stats.RangeDeletionsBytesEstimate) + return b.String() + } + } + return "(not found)" +} + +func runTableFileSizesCmd(td *datadriven.TestData, d *DB) string { + d.mu.Lock() + defer d.mu.Unlock() + return runVersionFileSizes(d.mu.versions.currentVersion()) +} + +func runVersionFileSizes(v *version) string { + var buf bytes.Buffer + for l, levelMetadata := range v.Levels { + if levelMetadata.Empty() { + continue + } + fmt.Fprintf(&buf, "L%d:\n", l) + iter := levelMetadata.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + fmt.Fprintf(&buf, " %s: %d bytes (%s)", f, f.Size, humanize.Bytes.Uint64(f.Size)) + if f.IsCompacting() { + fmt.Fprintf(&buf, " (IsCompacting)") + } + fmt.Fprintln(&buf) + } + } + return buf.String() +} + +// Prints some metadata about some sstable which is currently in the latest +// version. +func runMetadataCommand(t *testing.T, td *datadriven.TestData, d *DB) string { + var file int + td.ScanArgs(t, "file", &file) + var m *fileMetadata + d.mu.Lock() + currVersion := d.mu.versions.currentVersion() + for _, level := range currVersion.Levels { + lIter := level.Iter() + for f := lIter.First(); f != nil; f = lIter.Next() { + if f.FileNum == base.FileNum(uint64(file)) { + m = f + break + } + } + } + d.mu.Unlock() + var buf bytes.Buffer + // Add more metadata as needed. + fmt.Fprintf(&buf, "size: %d\n", m.Size) + return buf.String() +} + +func runSSTablePropertiesCmd(t *testing.T, td *datadriven.TestData, d *DB) string { + var file int + td.ScanArgs(t, "file", &file) + + // See if we can grab the FileMetadata associated with the file. This is needed + // to easily construct virtual sstable properties. 
+ var m *fileMetadata + d.mu.Lock() + currVersion := d.mu.versions.currentVersion() + for _, level := range currVersion.Levels { + lIter := level.Iter() + for f := lIter.First(); f != nil; f = lIter.Next() { + if f.FileNum == base.FileNum(uint64(file)) { + m = f + break + } + } + } + d.mu.Unlock() + + // Note that m can be nil here if the sstable exists in the file system, but + // not in the lsm. If m is nil just assume that file is not virtual. + + backingFileNum := base.FileNum(uint64(file)).DiskFileNum() + if m != nil { + backingFileNum = m.FileBacking.DiskFileNum + } + fileName := base.MakeFilename(fileTypeTable, backingFileNum) + f, err := d.opts.FS.Open(fileName) + if err != nil { + return err.Error() + } + readable, err := sstable.NewSimpleReadable(f) + if err != nil { + return err.Error() + } + // TODO(bananabrick): cacheOpts is used to set the file number on a Reader, + // and virtual sstables expect this file number to be set. Split out the + // opts into fileNum opts, and cache opts. 
+ cacheOpts := private.SSTableCacheOpts(0, backingFileNum).(sstable.ReaderOption) + r, err := sstable.NewReader(readable, d.opts.MakeReaderOptions(), cacheOpts) + if err != nil { + return err.Error() + } + defer r.Close() + + var v sstable.VirtualReader + props := r.Properties.String() + if m != nil && m.Virtual { + v = sstable.MakeVirtualReader(r, m.VirtualMeta(), false /* isForeign */) + props = v.Properties.String() + } + if len(td.Input) == 0 { + return props + } + var buf bytes.Buffer + propsSlice := strings.Split(props, "\n") + for _, requestedProp := range strings.Split(td.Input, "\n") { + fmt.Fprintf(&buf, "%s:\n", requestedProp) + for _, prop := range propsSlice { + if strings.Contains(prop, requestedProp) { + fmt.Fprintf(&buf, " %s\n", prop) + } + } + } + return buf.String() +} + +func runPopulateCmd(t *testing.T, td *datadriven.TestData, b *Batch) { + var maxKeyLength, valLength int + var timestamps []int + td.ScanArgs(t, "keylen", &maxKeyLength) + td.MaybeScanArgs(t, "timestamps", ×tamps) + td.MaybeScanArgs(t, "vallen", &valLength) + // Default to writing timestamps @1. + if len(timestamps) == 0 { + timestamps = append(timestamps, 1) + } + + ks := testkeys.Alpha(maxKeyLength) + buf := make([]byte, ks.MaxLen()+testkeys.MaxSuffixLen) + vbuf := make([]byte, valLength) + for i := int64(0); i < ks.Count(); i++ { + for _, ts := range timestamps { + n := testkeys.WriteKeyAt(buf, ks, i, int64(ts)) + + // Default to using the key as the value, but if the user provided + // the vallen argument, generate a random value of the specified + // length. + value := buf[:n] + if valLength > 0 { + _, err := crand.Read(vbuf) + require.NoError(t, err) + value = vbuf + } + require.NoError(t, b.Set(buf[:n], value, nil)) + } + } +} + +// waitTableStats waits until all new files' statistics have been loaded. It's +// used in tests. The d.mu mutex must be locked while calling this method. 
+func (d *DB) waitTableStats() { + for d.mu.tableStats.loading || len(d.mu.tableStats.pending) > 0 { + d.mu.tableStats.cond.Wait() + } +} + +func runIngestAndExciseCmd(td *datadriven.TestData, d *DB, fs vfs.FS) error { + var exciseSpan KeyRange + paths := make([]string, 0, len(td.CmdArgs)) + for i, arg := range td.CmdArgs { + switch td.CmdArgs[i].Key { + case "excise": + if len(td.CmdArgs[i].Vals) != 1 { + return errors.New("expected 2 values for excise separated by -, eg. ingest-and-excise foo1 excise=\"start-end\"") + } + fields := strings.Split(td.CmdArgs[i].Vals[0], "-") + if len(fields) != 2 { + return errors.New("expected 2 values for excise separated by -, eg. ingest-and-excise foo1 excise=\"start-end\"") + } + exciseSpan.Start = []byte(fields[0]) + exciseSpan.End = []byte(fields[1]) + default: + paths = append(paths, arg.String()) + } + } + + if _, err := d.IngestAndExcise(paths, nil /* shared */, exciseSpan); err != nil { + return err + } + return nil +} + +func runIngestCmd(td *datadriven.TestData, d *DB, fs vfs.FS) error { + paths := make([]string, 0, len(td.CmdArgs)) + for _, arg := range td.CmdArgs { + paths = append(paths, arg.String()) + } + + if err := d.Ingest(paths); err != nil { + return err + } + return nil +} + +func runIngestExternalCmd(td *datadriven.TestData, d *DB, locator string) error { + external := make([]ExternalFile, 0) + for _, arg := range strings.Split(td.Input, "\n") { + fields := strings.Split(arg, ",") + if len(fields) != 4 { + return errors.New("usage: path,size,smallest,largest") + } + ef := ExternalFile{} + ef.Locator = remote.Locator(locator) + ef.ObjName = fields[0] + sizeInt, err := strconv.Atoi(fields[1]) + if err != nil { + return err + } + ef.Size = uint64(sizeInt) + ef.SmallestUserKey = []byte(fields[2]) + ef.LargestUserKey = []byte(fields[3]) + ef.HasPointKey = true + external = append(external, ef) + } + + if _, err := d.IngestExternalFiles(external); err != nil { + return err + } + return nil +} + +func 
runForceIngestCmd(td *datadriven.TestData, d *DB) error { + var paths []string + var level int + for _, arg := range td.CmdArgs { + switch arg.Key { + case "paths": + paths = append(paths, arg.Vals...) + case "level": + var err error + level, err = strconv.Atoi(arg.Vals[0]) + if err != nil { + return err + } + } + } + _, err := d.ingest(paths, func( + tableNewIters, + keyspan.TableNewSpanIter, + IterOptions, + *Comparer, + *version, + int, + map[*compaction]struct{}, + *fileMetadata, + bool, + ) (int, *fileMetadata, error) { + return level, nil, nil + }, nil /* shared */, KeyRange{}, nil /* external */) + return err +} + +func runLSMCmd(td *datadriven.TestData, d *DB) string { + d.mu.Lock() + defer d.mu.Unlock() + if td.HasArg("verbose") { + return d.mu.versions.currentVersion().DebugString(d.opts.Comparer.FormatKey) + } + return d.mu.versions.currentVersion().String() +} + +func parseDBOptionsArgs(opts *Options, args []datadriven.CmdArg) error { + for _, cmdArg := range args { + switch cmdArg.Key { + case "auto-compactions": + switch cmdArg.Vals[0] { + case "off": + opts.DisableAutomaticCompactions = true + case "on": + opts.DisableAutomaticCompactions = false + default: + return errors.Errorf("Unrecognized %q arg value: %q", cmdArg.Key, cmdArg.Vals[0]) + } + case "inject-errors": + injs := make([]errorfs.Injector, len(cmdArg.Vals)) + for i := 0; i < len(cmdArg.Vals); i++ { + inj, err := errorfs.ParseDSL(cmdArg.Vals[i]) + if err != nil { + return err + } + injs[i] = inj + } + opts.FS = errorfs.Wrap(opts.FS, errorfs.Any(injs...)) + case "enable-table-stats": + enable, err := strconv.ParseBool(cmdArg.Vals[0]) + if err != nil { + return errors.Errorf("%s: could not parse %q as bool: %s", cmdArg.Key, cmdArg.Vals[0], err) + } + opts.private.disableTableStats = !enable + case "format-major-version": + v, err := strconv.Atoi(cmdArg.Vals[0]) + if err != nil { + return err + } + // Override the DB version. 
+ opts.FormatMajorVersion = FormatMajorVersion(v) + case "block-size": + v, err := strconv.Atoi(cmdArg.Vals[0]) + if err != nil { + return err + } + for i := range opts.Levels { + opts.Levels[i].BlockSize = v + } + case "index-block-size": + v, err := strconv.Atoi(cmdArg.Vals[0]) + if err != nil { + return err + } + for i := range opts.Levels { + opts.Levels[i].IndexBlockSize = v + } + case "target-file-size": + v, err := strconv.Atoi(cmdArg.Vals[0]) + if err != nil { + return err + } + for i := range opts.Levels { + opts.Levels[i].TargetFileSize = int64(v) + } + case "bloom-bits-per-key": + v, err := strconv.Atoi(cmdArg.Vals[0]) + if err != nil { + return err + } + fp := bloom.FilterPolicy(v) + opts.Filters = map[string]FilterPolicy{fp.Name(): fp} + for i := range opts.Levels { + opts.Levels[i].FilterPolicy = fp + } + case "merger": + switch cmdArg.Vals[0] { + case "appender": + opts.Merger = base.DefaultMerger + default: + return errors.Newf("unrecognized Merger %q\n", cmdArg.Vals[0]) + } + } + } + return nil +} diff --git a/pebble/db.go b/pebble/db.go new file mode 100644 index 0000000..ea23aaa --- /dev/null +++ b/pebble/db.go @@ -0,0 +1,3050 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package pebble provides an ordered key/value store. 
+package pebble // import "github.com/cockroachdb/pebble" + +import ( + "context" + "fmt" + "io" + "os" + "strconv" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/arenaskl" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invalidating" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/manual" + "github.com/cockroachdb/pebble/objstorage" + "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/rangekey" + "github.com/cockroachdb/pebble/record" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/vfs/atomicfs" + "github.com/cockroachdb/tokenbucket" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + // minTableCacheSize is the minimum size of the table cache, for a single db. + minTableCacheSize = 64 + + // numNonTableCacheFiles is an approximation for the number of files + // that we don't use for table caches, for a given db. + numNonTableCacheFiles = 10 +) + +var ( + // ErrNotFound is returned when a get operation does not find the requested + // key. + ErrNotFound = base.ErrNotFound + // ErrClosed is panicked when an operation is performed on a closed snapshot or + // DB. Use errors.Is(err, ErrClosed) to check for this error. + ErrClosed = errors.New("pebble: closed") + // ErrReadOnly is returned when a write operation is performed on a read-only + // database. + ErrReadOnly = errors.New("pebble: read-only") + // errNoSplit indicates that the user is trying to perform a range key + // operation but the configured Comparer does not provide a Split + // implementation. + errNoSplit = errors.New("pebble: Comparer.Split required for range key operations") +) + +// Reader is a readable key/value store. 
+// +// It is safe to call Get and NewIter from concurrent goroutines. +type Reader interface { + // Get gets the value for the given key. It returns ErrNotFound if the DB + // does not contain the key. + // + // The caller should not modify the contents of the returned slice, but it is + // safe to modify the contents of the argument after Get returns. The + // returned slice will remain valid until the returned Closer is closed. On + // success, the caller MUST call closer.Close() or a memory leak will occur. + Get(key []byte) (value []byte, closer io.Closer, err error) + + // NewIter returns an iterator that is unpositioned (Iterator.Valid() will + // return false). The iterator can be positioned via a call to SeekGE, + // SeekLT, First or Last. + NewIter(o *IterOptions) (*Iterator, error) + + // NewIterWithContext is like NewIter, and additionally accepts a context + // for tracing. + NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) + + // Close closes the Reader. It may or may not close any underlying io.Reader + // or io.Writer, depending on how the DB was created. + // + // It is not safe to close a DB until all outstanding iterators are closed. + // It is valid to call Close multiple times. Other methods should not be + // called after the DB has been closed. + Close() error +} + +// Writer is a writable key/value store. +// +// Goroutine safety is dependent on the specific implementation. +type Writer interface { + // Apply the operations contained in the batch to the DB. + // + // It is safe to modify the contents of the arguments after Apply returns. + Apply(batch *Batch, o *WriteOptions) error + + // Delete deletes the value for the given key. Deletes are blind all will + // succeed even if the given key does not exist. + // + // It is safe to modify the contents of the arguments after Delete returns. 
+ Delete(key []byte, o *WriteOptions) error + + // DeleteSized behaves identically to Delete, but takes an additional + // argument indicating the size of the value being deleted. DeleteSized + // should be preferred when the caller has the expectation that there exists + // a single internal KV pair for the key (eg, the key has not been + // overwritten recently), and the caller knows the size of its value. + // + // DeleteSized will record the value size within the tombstone and use it to + // inform compaction-picking heuristics which strive to reduce space + // amplification in the LSM. This "calling your shot" mechanic allows the + // storage engine to more accurately estimate and reduce space + // amplification. + // + // It is safe to modify the contents of the arguments after DeleteSized + // returns. + DeleteSized(key []byte, valueSize uint32, _ *WriteOptions) error + + // SingleDelete is similar to Delete in that it deletes the value for the given key. Like Delete, + // it is a blind operation that will succeed even if the given key does not exist. + // + // WARNING: Undefined (non-deterministic) behavior will result if a key is overwritten and + // then deleted using SingleDelete. The record may appear deleted immediately, but be + // resurrected at a later time after compactions have been performed. Or the record may + // be deleted permanently. A Delete operation lays down a "tombstone" which shadows all + // previous versions of a key. The SingleDelete operation is akin to "anti-matter" and will + // only delete the most recently written version for a key. These different semantics allow + // the DB to avoid propagating a SingleDelete operation during a compaction as soon as the + // corresponding Set operation is encountered. These semantics require extreme care to handle + // properly. Only use if you have a workload where the performance gain is critical and you + // can guarantee that a record is written once and then deleted once. 
+ // + // SingleDelete is internally transformed into a Delete if the most recent record for a key is either + // a Merge or Delete record. + // + // It is safe to modify the contents of the arguments after SingleDelete returns. + SingleDelete(key []byte, o *WriteOptions) error + + // DeleteRange deletes all of the point keys (and values) in the range + // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT + // delete overlapping range keys (eg, keys set via RangeKeySet). + // + // It is safe to modify the contents of the arguments after DeleteRange + // returns. + DeleteRange(start, end []byte, o *WriteOptions) error + + // LogData adds the specified to the batch. The data will be written to the + // WAL, but not added to memtables or sstables. Log data is never indexed, + // which makes it useful for testing WAL performance. + // + // It is safe to modify the contents of the argument after LogData returns. + LogData(data []byte, opts *WriteOptions) error + + // Merge merges the value for the given key. The details of the merge are + // dependent upon the configured merge operation. + // + // It is safe to modify the contents of the arguments after Merge returns. + Merge(key, value []byte, o *WriteOptions) error + + // Set sets the value for the given key. It overwrites any previous value + // for that key; a DB is not a multi-map. + // + // It is safe to modify the contents of the arguments after Set returns. + Set(key, value []byte, o *WriteOptions) error + + // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC + // timestamp suffix to value. The suffix is optional. If any portion of the key + // range [start, end) is already set by a range key with the same suffix value, + // RangeKeySet overrides it. + // + // It is safe to modify the contents of the arguments after RangeKeySet returns. 
+ RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) error + + // RangeKeyUnset removes a range key mapping the key range [start, end) at the + // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed + // range key. RangeKeyUnset only removes portions of range keys that fall within + // the [start, end) key span, and only range keys with suffixes that exactly + // match the unset suffix. + // + // It is safe to modify the contents of the arguments after RangeKeyUnset + // returns. + RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error + + // RangeKeyDelete deletes all of the range keys in the range [start,end) + // (inclusive on start, exclusive on end). It does not delete point keys (for + // that use DeleteRange). RangeKeyDelete removes all range keys within the + // bounds, including those with or without suffixes. + // + // It is safe to modify the contents of the arguments after RangeKeyDelete + // returns. + RangeKeyDelete(start, end []byte, opts *WriteOptions) error +} + +// CPUWorkHandle represents a handle used by the CPUWorkPermissionGranter API. +type CPUWorkHandle interface { + // Permitted indicates whether Pebble can use additional CPU resources. + Permitted() bool +} + +// CPUWorkPermissionGranter is used to request permission to opportunistically +// use additional CPUs to speed up internal background work. +type CPUWorkPermissionGranter interface { + // GetPermission returns a handle regardless of whether permission is granted + // or not. In the latter case, the handle is only useful for recording + // the CPU time actually spent on this calling goroutine. + GetPermission(time.Duration) CPUWorkHandle + // CPUWorkDone must be called regardless of whether CPUWorkHandle.Permitted + // returns true or false. + CPUWorkDone(CPUWorkHandle) +} + +// Use a default implementation for the CPU work granter to avoid excessive nil +// checks in the code. 
+type defaultCPUWorkHandle struct{} + +func (d defaultCPUWorkHandle) Permitted() bool { + return false +} + +type defaultCPUWorkGranter struct{} + +func (d defaultCPUWorkGranter) GetPermission(_ time.Duration) CPUWorkHandle { + return defaultCPUWorkHandle{} +} + +func (d defaultCPUWorkGranter) CPUWorkDone(_ CPUWorkHandle) {} + +// DB provides a concurrent, persistent ordered key/value store. +// +// A DB's basic operations (Get, Set, Delete) should be self-explanatory. Get +// and Delete will return ErrNotFound if the requested key is not in the store. +// Callers are free to ignore this error. +// +// A DB also allows for iterating over the key/value pairs in key order. If d +// is a DB, the code below prints all key/value pairs whose keys are 'greater +// than or equal to' k: +// +// iter := d.NewIter(readOptions) +// for iter.SeekGE(k); iter.Valid(); iter.Next() { +// fmt.Printf("key=%q value=%q\n", iter.Key(), iter.Value()) +// } +// return iter.Close() +// +// The Options struct holds the optional parameters for the DB, including a +// Comparer to define a 'less than' relationship over keys. It is always valid +// to pass a nil *Options, which means to use the default parameter values. Any +// zero field of a non-nil *Options also means to use the default value for +// that parameter. Thus, the code below uses a custom Comparer, but the default +// values for every other parameter: +// +// db := pebble.Open(&Options{ +// Comparer: myComparer, +// }) +type DB struct { + // The count and size of referenced memtables. This includes memtables + // present in DB.mu.mem.queue, as well as memtables that have been flushed + // but are still referenced by an inuse readState, as well as up to one + // memTable waiting to be reused and stored in d.memTableRecycle. + memTableCount atomic.Int64 + memTableReserved atomic.Int64 // number of bytes reserved in the cache for memtables + // memTableRecycle holds a pointer to an obsolete memtable. 
The next + // memtable allocation will reuse this memtable if it has not already been + // recycled. + memTableRecycle atomic.Pointer[memTable] + + // The size of the current log file (i.e. db.mu.log.queue[len(queue)-1]. + logSize atomic.Uint64 + + // The number of bytes available on disk. + diskAvailBytes atomic.Uint64 + + cacheID uint64 + dirname string + walDirname string + opts *Options + cmp Compare + equal Equal + merge Merge + split Split + abbreviatedKey AbbreviatedKey + // The threshold for determining when a batch is "large" and will skip being + // inserted into a memtable. + largeBatchThreshold uint64 + // The current OPTIONS file number. + optionsFileNum base.DiskFileNum + // The on-disk size of the current OPTIONS file. + optionsFileSize uint64 + + // objProvider is used to access and manage SSTs. + objProvider objstorage.Provider + + fileLock *Lock + dataDir vfs.File + walDir vfs.File + + tableCache *tableCacheContainer + newIters tableNewIters + tableNewRangeKeyIter keyspan.TableNewSpanIter + + commit *commitPipeline + + // readState provides access to the state needed for reading without needing + // to acquire DB.mu. + readState struct { + sync.RWMutex + val *readState + } + // logRecycler holds a set of log file numbers that are available for + // reuse. Writing to a recycled log file is faster than to a new log file on + // some common filesystems (xfs, and ext3/4) due to avoiding metadata + // updates. + logRecycler logRecycler + + closed *atomic.Value + closedCh chan struct{} + + cleanupManager *cleanupManager + + // During an iterator close, we may asynchronously schedule read compactions. + // We want to wait for those goroutines to finish, before closing the DB. + // compactionShedulers.Wait() should not be called while the DB.mu is held. + compactionSchedulers sync.WaitGroup + + // The main mutex protecting internal DB state. This mutex encompasses many + // fields because those fields need to be accessed and updated atomically. 
In + // particular, the current version, log.*, mem.*, and snapshot list need to + // be accessed and updated atomically during compaction. + // + // Care is taken to avoid holding DB.mu during IO operations. Accomplishing + // this sometimes requires releasing DB.mu in a method that was called with + // it held. See versionSet.logAndApply() and DB.makeRoomForWrite() for + // examples. This is a common pattern, so be careful about expectations that + // DB.mu will be held continuously across a set of calls. + mu struct { + sync.Mutex + + formatVers struct { + // vers is the database's current format major version. + // Backwards-incompatible features are gated behind new + // format major versions and not enabled until a database's + // version is ratcheted upwards. + // + // Although this is under the `mu` prefix, readers may read vers + // atomically without holding d.mu. Writers must only write to this + // value through finalizeFormatVersUpgrade which requires d.mu is + // held. + vers atomic.Uint64 + // marker is the atomic marker for the format major version. + // When a database's version is ratcheted upwards, the + // marker is moved in order to atomically record the new + // version. + marker *atomicfs.Marker + // ratcheting when set to true indicates that the database is + // currently in the process of ratcheting the format major version + // to vers + 1. As a part of ratcheting the format major version, + // migrations may drop and re-acquire the mutex. + ratcheting bool + } + + // The ID of the next job. Job IDs are passed to event listener + // notifications and act as a mechanism for tying together the events and + // log messages for a single job such as a flush, compaction, or file + // ingestion. Job IDs are not serialized to disk or used for correctness. + nextJobID int + + // The collection of immutable versions and state about the log and visible + // sequence numbers. 
Use the pointer here to ensure the atomic fields in + // version set are aligned properly. + versions *versionSet + + log struct { + // The queue of logs, containing both flushed and unflushed logs. The + // flushed logs will be a prefix, the unflushed logs a suffix. The + // delimeter between flushed and unflushed logs is + // versionSet.minUnflushedLogNum. + queue []fileInfo + // The number of input bytes to the log. This is the raw size of the + // batches written to the WAL, without the overhead of the record + // envelopes. + bytesIn uint64 + // The LogWriter is protected by commitPipeline.mu. This allows log + // writes to be performed without holding DB.mu, but requires both + // commitPipeline.mu and DB.mu to be held when rotating the WAL/memtable + // (i.e. makeRoomForWrite). + *record.LogWriter + // Can be nil. + metrics struct { + fsyncLatency prometheus.Histogram + record.LogWriterMetrics + } + registerLogWriterForTesting func(w *record.LogWriter) + } + + mem struct { + // The current mutable memTable. + mutable *memTable + // Queue of flushables (the mutable memtable is at end). Elements are + // added to the end of the slice and removed from the beginning. Once an + // index is set it is never modified making a fixed slice immutable and + // safe for concurrent reads. + queue flushableList + // nextSize is the size of the next memtable. The memtable size starts at + // min(256KB,Options.MemTableSize) and doubles each time a new memtable + // is allocated up to Options.MemTableSize. This reduces the memory + // footprint of memtables when lots of DB instances are used concurrently + // in test environments. + nextSize uint64 + } + + compact struct { + // Condition variable used to signal when a flush or compaction has + // completed. Used by the write-stall mechanism to wait for the stall + // condition to clear. See DB.makeRoomForWrite(). + cond sync.Cond + // True when a flush is in progress. + flushing bool + // The number of ongoing compactions. 
+ compactingCount int + // The list of deletion hints, suggesting ranges for delete-only + // compactions. + deletionHints []deleteCompactionHint + // The list of manual compactions. The next manual compaction to perform + // is at the start of the list. New entries are added to the end. + manual []*manualCompaction + // inProgress is the set of in-progress flushes and compactions. + // It's used in the calculation of some metrics and to initialize L0 + // sublevels' state. Some of the compactions contained within this + // map may have already committed an edit to the version but are + // lingering performing cleanup, like deleting obsolete files. + inProgress map[*compaction]struct{} + + // rescheduleReadCompaction indicates to an iterator that a read compaction + // should be scheduled. + rescheduleReadCompaction bool + + // readCompactions is a readCompactionQueue which keeps track of the + // compactions which we might have to perform. + readCompactions readCompactionQueue + + // The cumulative duration of all completed compactions since Open. + // Does not include flushes. + duration time.Duration + // Flush throughput metric. + flushWriteThroughput ThroughputMetric + // The idle start time for the flush "loop", i.e., when the flushing + // bool above transitions to false. + noOngoingFlushStartTime time.Time + } + + // Non-zero when file cleaning is disabled. The disabled count acts as a + // reference count to prohibit file cleaning. See + // DB.{disable,Enable}FileDeletions(). + disableFileDeletions int + + snapshots struct { + // The list of active snapshots. + snapshotList + + // The cumulative count and size of snapshot-pinned keys written to + // sstables. + cumulativePinnedCount uint64 + cumulativePinnedSize uint64 + } + + tableStats struct { + // Condition variable used to signal the completion of a + // job to collect table stats. + cond sync.Cond + // True when a stat collection operation is in progress. 
+ loading bool + // True if stat collection has loaded statistics for all tables + // other than those listed explicitly in pending. This flag starts + // as false when a database is opened and flips to true once stat + // collection has caught up. + loadedInitial bool + // A slice of files for which stats have not been computed. + // Compactions, ingests, flushes append files to be processed. An + // active stat collection goroutine clears the list and processes + // them. + pending []manifest.NewFileEntry + } + + tableValidation struct { + // cond is a condition variable used to signal the completion of a + // job to validate one or more sstables. + cond sync.Cond + // pending is a slice of metadata for sstables waiting to be + // validated. Only physical sstables should be added to the pending + // queue. + pending []newFileEntry + // validating is set to true when validation is running. + validating bool + } + } + + // Normally equal to time.Now() but may be overridden in tests. + timeNow func() time.Time + // the time at database Open; may be used to compute metrics like effective + // compaction concurrency + openedAt time.Time +} + +var _ Reader = (*DB)(nil) +var _ Writer = (*DB)(nil) + +// TestOnlyWaitForCleaning MUST only be used in tests. +func (d *DB) TestOnlyWaitForCleaning() { + d.cleanupManager.Wait() +} + +// Get gets the value for the given key. It returns ErrNotFound if the DB does +// not contain the key. +// +// The caller should not modify the contents of the returned slice, but it is +// safe to modify the contents of the argument after Get returns. The returned +// slice will remain valid until the returned Closer is closed. On success, the +// caller MUST call closer.Close() or a memory leak will occur. 
+func (d *DB) Get(key []byte) ([]byte, io.Closer, error) { + return d.getInternal(key, nil /* batch */, nil /* snapshot */) +} + +type getIterAlloc struct { + dbi Iterator + keyBuf []byte + get getIter +} + +var getIterAllocPool = sync.Pool{ + New: func() interface{} { + return &getIterAlloc{} + }, +} + +func (d *DB) getInternal(key []byte, b *Batch, s *Snapshot) ([]byte, io.Closer, error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + + // Grab and reference the current readState. This prevents the underlying + // files in the associated version from being deleted if there is a current + // compaction. The readState is unref'd by Iterator.Close(). + readState := d.loadReadState() + + // Determine the seqnum to read at after grabbing the read state (current and + // memtables) above. + var seqNum uint64 + if s != nil { + seqNum = s.seqNum + } else { + seqNum = d.mu.versions.visibleSeqNum.Load() + } + + buf := getIterAllocPool.Get().(*getIterAlloc) + + get := &buf.get + *get = getIter{ + logger: d.opts.Logger, + comparer: d.opts.Comparer, + newIters: d.newIters, + snapshot: seqNum, + key: key, + batch: b, + mem: readState.memtables, + l0: readState.current.L0SublevelFiles, + version: readState.current, + } + + // Strip off memtables which cannot possibly contain the seqNum being read + // at. + for len(get.mem) > 0 { + n := len(get.mem) + if logSeqNum := get.mem[n-1].logSeqNum; logSeqNum < seqNum { + break + } + get.mem = get.mem[:n-1] + } + + i := &buf.dbi + pointIter := get + *i = Iterator{ + ctx: context.Background(), + getIterAlloc: buf, + iter: pointIter, + pointIter: pointIter, + merge: d.merge, + comparer: *d.opts.Comparer, + readState: readState, + keyBuf: buf.keyBuf, + } + + if !i.First() { + err := i.Close() + if err != nil { + return nil, nil, err + } + return nil, nil, ErrNotFound + } + return i.Value(), i, nil +} + +// Set sets the value for the given key. It overwrites any previous value +// for that key; a DB is not a multi-map. 
+// +// It is safe to modify the contents of the arguments after Set returns. +func (d *DB) Set(key, value []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.Set(key, value, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// Delete deletes the value for the given key. Deletes are blind all will +// succeed even if the given key does not exist. +// +// It is safe to modify the contents of the arguments after Delete returns. +func (d *DB) Delete(key []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.Delete(key, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// DeleteSized behaves identically to Delete, but takes an additional +// argument indicating the size of the value being deleted. DeleteSized +// should be preferred when the caller has the expectation that there exists +// a single internal KV pair for the key (eg, the key has not been +// overwritten recently), and the caller knows the size of its value. +// +// DeleteSized will record the value size within the tombstone and use it to +// inform compaction-picking heuristics which strive to reduce space +// amplification in the LSM. This "calling your shot" mechanic allows the +// storage engine to more accurately estimate and reduce space amplification. +// +// It is safe to modify the contents of the arguments after DeleteSized +// returns. +func (d *DB) DeleteSized(key []byte, valueSize uint32, opts *WriteOptions) error { + b := newBatch(d) + _ = b.DeleteSized(key, valueSize, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// SingleDelete adds an action to the batch that single deletes the entry for key. +// See Writer.SingleDelete for more details on the semantics of SingleDelete. 
+// +// It is safe to modify the contents of the arguments after SingleDelete returns. +func (d *DB) SingleDelete(key []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.SingleDelete(key, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// DeleteRange deletes all of the keys (and values) in the range [start,end) +// (inclusive on start, exclusive on end). +// +// It is safe to modify the contents of the arguments after DeleteRange +// returns. +func (d *DB) DeleteRange(start, end []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.DeleteRange(start, end, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// Merge adds an action to the DB that merges the value at key with the new +// value. The details of the merge are dependent upon the configured merge +// operator. +// +// It is safe to modify the contents of the arguments after Merge returns. +func (d *DB) Merge(key, value []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.Merge(key, value, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// LogData adds the specified to the batch. The data will be written to the +// WAL, but not added to memtables or sstables. Log data is never indexed, +// which makes it useful for testing WAL performance. +// +// It is safe to modify the contents of the argument after LogData returns. +func (d *DB) LogData(data []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.LogData(data, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// RangeKeySet sets a range key mapping the key range [start, end) at the MVCC +// timestamp suffix to value. The suffix is optional. 
If any portion of the key +// range [start, end) is already set by a range key with the same suffix value, +// RangeKeySet overrides it. +// +// It is safe to modify the contents of the arguments after RangeKeySet returns. +func (d *DB) RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.RangeKeySet(start, end, suffix, value, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// RangeKeyUnset removes a range key mapping the key range [start, end) at the +// MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed +// range key. RangeKeyUnset only removes portions of range keys that fall within +// the [start, end) key span, and only range keys with suffixes that exactly +// match the unset suffix. +// +// It is safe to modify the contents of the arguments after RangeKeyUnset +// returns. +func (d *DB) RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.RangeKeyUnset(start, end, suffix, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// RangeKeyDelete deletes all of the range keys in the range [start,end) +// (inclusive on start, exclusive on end). It does not delete point keys (for +// that use DeleteRange). RangeKeyDelete removes all range keys within the +// bounds, including those with or without suffixes. +// +// It is safe to modify the contents of the arguments after RangeKeyDelete +// returns. +func (d *DB) RangeKeyDelete(start, end []byte, opts *WriteOptions) error { + b := newBatch(d) + _ = b.RangeKeyDelete(start, end, opts) + if err := d.Apply(b, opts); err != nil { + return err + } + // Only release the batch on success. + b.release() + return nil +} + +// Apply the operations contained in the batch to the DB. 
If the batch is large +// the contents of the batch may be retained by the database. If that occurs +// the batch contents will be cleared preventing the caller from attempting to +// reuse them. +// +// It is safe to modify the contents of the arguments after Apply returns. +func (d *DB) Apply(batch *Batch, opts *WriteOptions) error { + return d.applyInternal(batch, opts, false) +} + +// ApplyNoSyncWait must only be used when opts.Sync is true and the caller +// does not want to wait for the WAL fsync to happen. The method will return +// once the mutation is applied to the memtable and is visible (note that a +// mutation is visible before the WAL sync even in the wait case, so we have +// not weakened the durability semantics). The caller must call Batch.SyncWait +// to wait for the WAL fsync. The caller must not Close the batch without +// first calling Batch.SyncWait. +// +// RECOMMENDATION: Prefer using Apply unless you really understand why you +// need ApplyNoSyncWait. +// EXPERIMENTAL: API/feature subject to change. Do not yet use outside +// CockroachDB. 
+func (d *DB) ApplyNoSyncWait(batch *Batch, opts *WriteOptions) error { + if !opts.Sync { + return errors.Errorf("cannot request asynchonous apply when WriteOptions.Sync is false") + } + return d.applyInternal(batch, opts, true) +} + +// REQUIRES: noSyncWait => opts.Sync +func (d *DB) applyInternal(batch *Batch, opts *WriteOptions, noSyncWait bool) error { + if err := d.closed.Load(); err != nil { + panic(err) + } + if batch.committing { + panic("pebble: batch already committing") + } + if batch.applied.Load() { + panic("pebble: batch already applied") + } + if d.opts.ReadOnly { + return ErrReadOnly + } + if batch.db != nil && batch.db != d { + panic(fmt.Sprintf("pebble: batch db mismatch: %p != %p", batch.db, d)) + } + + sync := opts.GetSync() + if sync && d.opts.DisableWAL { + return errors.New("pebble: WAL disabled") + } + + if batch.minimumFormatMajorVersion != FormatMostCompatible { + if fmv := d.FormatMajorVersion(); fmv < batch.minimumFormatMajorVersion { + panic(fmt.Sprintf( + "pebble: batch requires at least format major version %d (current: %d)", + batch.minimumFormatMajorVersion, fmv, + )) + } + } + + if batch.countRangeKeys > 0 { + if d.split == nil { + return errNoSplit + } + // TODO(jackson): Assert that all range key operands are suffixless. + } + batch.committing = true + + if batch.db == nil { + if err := batch.refreshMemTableSize(); err != nil { + return err + } + } + if batch.memTableSize >= d.largeBatchThreshold { + var err error + batch.flushable, err = newFlushableBatch(batch, d.opts.Comparer) + if err != nil { + return err + } + } + if err := d.commit.Commit(batch, sync, noSyncWait); err != nil { + // There isn't much we can do on an error here. The commit pipeline will be + // horked at this point. + d.opts.Logger.Fatalf("pebble: fatal commit error: %v", err) + } + // If this is a large batch, we need to clear the batch contents as the + // flushable batch may still be present in the flushables queue. 
+ // + // TODO(peter): Currently large batches are written to the WAL. We could + // skip the WAL write and instead wait for the large batch to be flushed to + // an sstable. For a 100 MB batch, this might actually be faster. For a 1 + // GB batch this is almost certainly faster. + if batch.flushable != nil { + batch.data = nil + } + return nil +} + +func (d *DB) commitApply(b *Batch, mem *memTable) error { + if b.flushable != nil { + // This is a large batch which was already added to the immutable queue. + return nil + } + err := mem.apply(b, b.SeqNum()) + if err != nil { + return err + } + + // If the batch contains range tombstones and the database is configured + // to flush range deletions, schedule a delayed flush so that disk space + // may be reclaimed without additional writes or an explicit flush. + if b.countRangeDels > 0 && d.opts.FlushDelayDeleteRange > 0 { + d.mu.Lock() + d.maybeScheduleDelayedFlush(mem, d.opts.FlushDelayDeleteRange) + d.mu.Unlock() + } + + // If the batch contains range keys and the database is configured to flush + // range keys, schedule a delayed flush so that the range keys are cleared + // from the memtable. + if b.countRangeKeys > 0 && d.opts.FlushDelayRangeKey > 0 { + d.mu.Lock() + d.maybeScheduleDelayedFlush(mem, d.opts.FlushDelayRangeKey) + d.mu.Unlock() + } + + if mem.writerUnref() { + d.mu.Lock() + d.maybeScheduleFlush() + d.mu.Unlock() + } + return nil +} + +func (d *DB) commitWrite(b *Batch, syncWG *sync.WaitGroup, syncErr *error) (*memTable, error) { + var size int64 + repr := b.Repr() + + if b.flushable != nil { + // We have a large batch. Such batches are special in that they don't get + // added to the memtable, and are instead inserted into the queue of + // memtables. The call to makeRoomForWrite with this batch will force the + // current memtable to be flushed. We want the large batch to be part of + // the same log, so we add it to the WAL here, rather than after the call + // to makeRoomForWrite(). 
+ // + // Set the sequence number since it was not set to the correct value earlier + // (see comment in newFlushableBatch()). + b.flushable.setSeqNum(b.SeqNum()) + if !d.opts.DisableWAL { + var err error + size, err = d.mu.log.SyncRecord(repr, syncWG, syncErr) + if err != nil { + panic(err) + } + } + } + + d.mu.Lock() + + var err error + if !b.ingestedSSTBatch { + // Batches which contain keys of kind InternalKeyKindIngestSST will + // never be applied to the memtable, so we don't need to make room for + // write. For the other cases, switch out the memtable if there was not + // enough room to store the batch. + err = d.makeRoomForWrite(b) + } + + if err == nil && !d.opts.DisableWAL { + d.mu.log.bytesIn += uint64(len(repr)) + } + + // Grab a reference to the memtable while holding DB.mu. Note that for + // non-flushable batches (b.flushable == nil) makeRoomForWrite() added a + // reference to the memtable which will prevent it from being flushed until + // we unreference it. This reference is dropped in DB.commitApply(). + mem := d.mu.mem.mutable + + d.mu.Unlock() + if err != nil { + return nil, err + } + + if d.opts.DisableWAL { + return mem, nil + } + + if b.flushable == nil { + size, err = d.mu.log.SyncRecord(repr, syncWG, syncErr) + if err != nil { + panic(err) + } + } + + d.logSize.Store(uint64(size)) + return mem, err +} + +type iterAlloc struct { + dbi Iterator + keyBuf []byte + boundsBuf [2][]byte + prefixOrFullSeekKey []byte + merging mergingIter + mlevels [3 + numLevels]mergingIterLevel + levels [3 + numLevels]levelIter + levelsPositioned [3 + numLevels]bool +} + +var iterAllocPool = sync.Pool{ + New: func() interface{} { + return &iterAlloc{} + }, +} + +// snapshotIterOpts denotes snapshot-related iterator options when calling +// newIter. These are the possible cases for a snapshotIterOpts: +// - No snapshot: All fields are zero values. +// - Classic snapshot: Only `seqNum` is set. 
The latest readState will be used +// and the specified seqNum will be used as the snapshot seqNum. +// - EventuallyFileOnlySnapshot (EFOS) behaving as a classic snapshot. Only +// the `seqNum` is set. The latest readState will be used +// and the specified seqNum will be used as the snapshot seqNum. +// - EFOS in file-only state: Only `seqNum` and `vers` are set. All the +// relevant SSTs are referenced by the *version. +type snapshotIterOpts struct { + seqNum uint64 + vers *version +} + +type batchIterOpts struct { + batchOnly bool +} +type newIterOpts struct { + snapshot snapshotIterOpts + batch batchIterOpts +} + +// newIter constructs a new iterator, merging in batch iterators as an extra +// level. +func (d *DB) newIter( + ctx context.Context, batch *Batch, internalOpts newIterOpts, o *IterOptions, +) *Iterator { + if internalOpts.batch.batchOnly { + if batch == nil { + panic("batchOnly is true, but batch is nil") + } + if internalOpts.snapshot.vers != nil { + panic("batchOnly is true, but snapshotIterOpts is initialized") + } + } + if err := d.closed.Load(); err != nil { + panic(err) + } + seqNum := internalOpts.snapshot.seqNum + if o.rangeKeys() { + if d.FormatMajorVersion() < FormatRangeKeys { + panic(fmt.Sprintf( + "pebble: range keys require at least format major version %d (current: %d)", + FormatRangeKeys, d.FormatMajorVersion(), + )) + } + } + if o != nil && o.RangeKeyMasking.Suffix != nil && o.KeyTypes != IterKeyTypePointsAndRanges { + panic("pebble: range key masking requires IterKeyTypePointsAndRanges") + } + if (batch != nil || seqNum != 0) && (o != nil && o.OnlyReadGuaranteedDurable) { + // We could add support for OnlyReadGuaranteedDurable on snapshots if + // there was a need: this would require checking that the sequence number + // of the snapshot has been flushed, by comparing with + // DB.mem.queue[0].logSeqNum. 
+ panic("OnlyReadGuaranteedDurable is not supported for batches or snapshots") + } + var readState *readState + var newIters tableNewIters + var newIterRangeKey keyspan.TableNewSpanIter + if !internalOpts.batch.batchOnly { + // Grab and reference the current readState. This prevents the underlying + // files in the associated version from being deleted if there is a current + // compaction. The readState is unref'd by Iterator.Close(). + if internalOpts.snapshot.vers == nil { + // NB: loadReadState() calls readState.ref(). + readState = d.loadReadState() + } else { + // vers != nil + internalOpts.snapshot.vers.Ref() + } + + // Determine the seqnum to read at after grabbing the read state (current and + // memtables) above. + if seqNum == 0 { + seqNum = d.mu.versions.visibleSeqNum.Load() + } + newIters = d.newIters + newIterRangeKey = d.tableNewRangeKeyIter + } + + // Bundle various structures under a single umbrella in order to allocate + // them together. + buf := iterAllocPool.Get().(*iterAlloc) + dbi := &buf.dbi + *dbi = Iterator{ + ctx: ctx, + alloc: buf, + merge: d.merge, + comparer: *d.opts.Comparer, + readState: readState, + version: internalOpts.snapshot.vers, + keyBuf: buf.keyBuf, + prefixOrFullSeekKey: buf.prefixOrFullSeekKey, + boundsBuf: buf.boundsBuf, + batch: batch, + newIters: newIters, + newIterRangeKey: newIterRangeKey, + seqNum: seqNum, + batchOnlyIter: internalOpts.batch.batchOnly, + } + if o != nil { + dbi.opts = *o + dbi.processBounds(o.LowerBound, o.UpperBound) + } + dbi.opts.logger = d.opts.Logger + if d.opts.private.disableLazyCombinedIteration { + dbi.opts.disableLazyCombinedIteration = true + } + if batch != nil { + dbi.batchSeqNum = dbi.batch.nextSeqNum() + } + return finishInitializingIter(ctx, buf) +} + +// finishInitializingIter is a helper for doing the non-trivial initialization +// of an Iterator. 
It's invoked to perform the initial initialization of an +// Iterator during NewIter or Clone, and to perform reinitialization due to a +// change in IterOptions by a call to Iterator.SetOptions. +func finishInitializingIter(ctx context.Context, buf *iterAlloc) *Iterator { + // Short-hand. + dbi := &buf.dbi + var memtables flushableList + if dbi.readState != nil { + memtables = dbi.readState.memtables + } + if dbi.opts.OnlyReadGuaranteedDurable { + memtables = nil + } else { + // We only need to read from memtables which contain sequence numbers older + // than seqNum. Trim off newer memtables. + for i := len(memtables) - 1; i >= 0; i-- { + if logSeqNum := memtables[i].logSeqNum; logSeqNum < dbi.seqNum { + break + } + memtables = memtables[:i] + } + } + + if dbi.opts.pointKeys() { + // Construct the point iterator, initializing dbi.pointIter to point to + // dbi.merging. If this is called during a SetOptions call and this + // Iterator has already initialized dbi.merging, constructPointIter is a + // noop and an initialized pointIter already exists in dbi.pointIter. + dbi.constructPointIter(ctx, memtables, buf) + dbi.iter = dbi.pointIter + } else { + dbi.iter = emptyIter + } + + if dbi.opts.rangeKeys() { + dbi.rangeKeyMasking.init(dbi, dbi.comparer.Compare, dbi.comparer.Split) + + // When iterating over both point and range keys, don't create the + // range-key iterator stack immediately if we can avoid it. This + // optimization takes advantage of the expected sparseness of range + // keys, and configures the point-key iterator to dynamically switch to + // combined iteration when it observes a file containing range keys. + // + // Lazy combined iteration is not possible if a batch or a memtable + // contains any range keys. 
+ useLazyCombinedIteration := dbi.rangeKey == nil && + dbi.opts.KeyTypes == IterKeyTypePointsAndRanges && + (dbi.batch == nil || dbi.batch.countRangeKeys == 0) && + !dbi.opts.disableLazyCombinedIteration + if useLazyCombinedIteration { + // The user requested combined iteration, and there's no indexed + // batch currently containing range keys that would prevent lazy + // combined iteration. Check the memtables to see if they contain + // any range keys. + for i := range memtables { + if memtables[i].containsRangeKeys() { + useLazyCombinedIteration = false + break + } + } + } + + if useLazyCombinedIteration { + dbi.lazyCombinedIter = lazyCombinedIter{ + parent: dbi, + pointIter: dbi.pointIter, + combinedIterState: combinedIterState{ + initialized: false, + }, + } + dbi.iter = &dbi.lazyCombinedIter + dbi.iter = invalidating.MaybeWrapIfInvariants(dbi.iter) + } else { + dbi.lazyCombinedIter.combinedIterState = combinedIterState{ + initialized: true, + } + if dbi.rangeKey == nil { + dbi.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) + dbi.rangeKey.init(dbi.comparer.Compare, dbi.comparer.Split, &dbi.opts) + dbi.constructRangeKeyIter() + } else { + dbi.rangeKey.iterConfig.SetBounds(dbi.opts.LowerBound, dbi.opts.UpperBound) + } + + // Wrap the point iterator (currently dbi.iter) with an interleaving + // iterator that interleaves range keys pulled from + // dbi.rangeKey.rangeKeyIter. + // + // NB: The interleaving iterator is always reinitialized, even if + // dbi already had an initialized range key iterator, in case the point + // iterator changed or the range key masking suffix changed. + dbi.rangeKey.iiter.Init(&dbi.comparer, dbi.iter, dbi.rangeKey.rangeKeyIter, + keyspan.InterleavingIterOpts{ + Mask: &dbi.rangeKeyMasking, + LowerBound: dbi.opts.LowerBound, + UpperBound: dbi.opts.UpperBound, + }) + dbi.iter = &dbi.rangeKey.iiter + } + } else { + // !dbi.opts.rangeKeys() + // + // Reset the combined iterator state. 
The initialized=true ensures the + // iterator doesn't unnecessarily try to switch to combined iteration. + dbi.lazyCombinedIter.combinedIterState = combinedIterState{initialized: true} + } + return dbi +} + +// ScanInternal scans all internal keys within the specified bounds, truncating +// any rangedels and rangekeys to those bounds if they span past them. For use +// when an external user needs to be aware of all internal keys that make up a +// key range. +// +// Keys deleted by range deletions must not be returned or exposed by this +// method, while the range deletion deleting that key must be exposed using +// visitRangeDel. Keys that would be masked by range key masking (if an +// appropriate prefix were set) should be exposed, alongside the range key +// that would have masked it. This method also collapses all point keys into +// one InternalKey; so only one internal key at most per user key is returned +// to visitPointKey. +// +// If visitSharedFile is not nil, ScanInternal iterates in skip-shared iteration +// mode. In this iteration mode, sstables in levels L5 and L6 are skipped, and +// their metadatas truncated to [lower, upper) and passed into visitSharedFile. +// ErrInvalidSkipSharedIteration is returned if visitSharedFile is not nil and an +// sstable in L5 or L6 is found that is not in shared storage according to +// provider.IsShared, or an sstable in those levels contains a newer key than the +// snapshot sequence number (only applicable for snapshot.ScanInternal). Examples +// of when this could happen could be if Pebble started writing sstables before a +// creator ID was set (as creator IDs are necessary to enable shared storage) +// resulting in some lower level SSTs being on non-shared storage. Skip-shared +// iteration is invalid in those cases. 
+func (d *DB) ScanInternal( + ctx context.Context, + categoryAndQoS sstable.CategoryAndQoS, + lower, upper []byte, + visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, + visitRangeDel func(start, end []byte, seqNum uint64) error, + visitRangeKey func(start, end []byte, keys []rangekey.Key) error, + visitSharedFile func(sst *SharedSSTMeta) error, +) error { + scanInternalOpts := &scanInternalOptions{ + CategoryAndQoS: categoryAndQoS, + visitPointKey: visitPointKey, + visitRangeDel: visitRangeDel, + visitRangeKey: visitRangeKey, + visitSharedFile: visitSharedFile, + skipSharedLevels: visitSharedFile != nil, + IterOptions: IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: lower, + UpperBound: upper, + }, + } + iter, err := d.newInternalIter(ctx, snapshotIterOpts{} /* snapshot */, scanInternalOpts) + if err != nil { + return err + } + defer iter.close() + return scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) +} + +// newInternalIter constructs and returns a new scanInternalIterator on this db. +// If o.skipSharedLevels is true, levels below sharedLevelsStart are *not* added +// to the internal iterator. +// +// TODO(bilal): This method has a lot of similarities with db.newIter as well as +// finishInitializingIter. Both pairs of methods should be refactored to reduce +// this duplication. +func (d *DB) newInternalIter( + ctx context.Context, sOpts snapshotIterOpts, o *scanInternalOptions, +) (*scanInternalIterator, error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + // Grab and reference the current readState. This prevents the underlying + // files in the associated version from being deleted if there is a current + // compaction. The readState is unref'd by Iterator.Close(). 
+ var readState *readState + if sOpts.vers == nil { + readState = d.loadReadState() + } + if sOpts.vers != nil { + sOpts.vers.Ref() + } + + // Determine the seqnum to read at after grabbing the read state (current and + // memtables) above. + seqNum := sOpts.seqNum + if seqNum == 0 { + seqNum = d.mu.versions.visibleSeqNum.Load() + } + + // Bundle various structures under a single umbrella in order to allocate + // them together. + buf := iterAllocPool.Get().(*iterAlloc) + dbi := &scanInternalIterator{ + ctx: ctx, + db: d, + comparer: d.opts.Comparer, + merge: d.opts.Merger.Merge, + readState: readState, + version: sOpts.vers, + alloc: buf, + newIters: d.newIters, + newIterRangeKey: d.tableNewRangeKeyIter, + seqNum: seqNum, + mergingIter: &buf.merging, + } + dbi.opts = *o + dbi.opts.logger = d.opts.Logger + if d.opts.private.disableLazyCombinedIteration { + dbi.opts.disableLazyCombinedIteration = true + } + return finishInitializingInternalIter(buf, dbi) +} + +func finishInitializingInternalIter( + buf *iterAlloc, i *scanInternalIterator, +) (*scanInternalIterator, error) { + // Short-hand. + var memtables flushableList + if i.readState != nil { + memtables = i.readState.memtables + } + // We only need to read from memtables which contain sequence numbers older + // than seqNum. Trim off newer memtables. + for j := len(memtables) - 1; j >= 0; j-- { + if logSeqNum := memtables[j].logSeqNum; logSeqNum < i.seqNum { + break + } + memtables = memtables[:j] + } + i.initializeBoundBufs(i.opts.LowerBound, i.opts.UpperBound) + + i.constructPointIter(i.opts.CategoryAndQoS, memtables, buf) + + // For internal iterators, we skip the lazy combined iteration optimization + // entirely, and create the range key iterator stack directly. 
+ i.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) + i.rangeKey.init(i.comparer.Compare, i.comparer.Split, &i.opts.IterOptions) + if err := i.constructRangeKeyIter(); err != nil { + return nil, err + } + + // Wrap the point iterator (currently i.iter) with an interleaving + // iterator that interleaves range keys pulled from + // i.rangeKey.rangeKeyIter. + i.rangeKey.iiter.Init(i.comparer, i.iter, i.rangeKey.rangeKeyIter, + keyspan.InterleavingIterOpts{ + LowerBound: i.opts.LowerBound, + UpperBound: i.opts.UpperBound, + }) + i.iter = &i.rangeKey.iiter + + return i, nil +} + +func (i *Iterator) constructPointIter( + ctx context.Context, memtables flushableList, buf *iterAlloc, +) { + if i.pointIter != nil { + // Already have one. + return + } + internalOpts := internalIterOpts{stats: &i.stats.InternalStats} + if i.opts.RangeKeyMasking.Filter != nil { + internalOpts.boundLimitedFilter = &i.rangeKeyMasking + } + + // Merging levels and levels from iterAlloc. + mlevels := buf.mlevels[:0] + levels := buf.levels[:0] + + // We compute the number of levels needed ahead of time and reallocate a slice if + // the array from the iterAlloc isn't large enough. Doing this allocation once + // should improve the performance. + numMergingLevels := 0 + numLevelIters := 0 + if i.batch != nil { + numMergingLevels++ + } + + var current *version + if !i.batchOnlyIter { + numMergingLevels += len(memtables) + + current = i.version + if current == nil { + current = i.readState.current + } + numMergingLevels += len(current.L0SublevelFiles) + numLevelIters += len(current.L0SublevelFiles) + for level := 1; level < len(current.Levels); level++ { + if current.Levels[level].Empty() { + continue + } + numMergingLevels++ + numLevelIters++ + } + } + + if numMergingLevels > cap(mlevels) { + mlevels = make([]mergingIterLevel, 0, numMergingLevels) + } + if numLevelIters > cap(levels) { + levels = make([]levelIter, 0, numLevelIters) + } + + // Top-level is the batch, if any. 
+ if i.batch != nil { + if i.batch.index == nil { + // This isn't an indexed batch. We shouldn't have gotten this far. + panic(errors.AssertionFailedf("creating an iterator over an unindexed batch")) + } else { + i.batch.initInternalIter(&i.opts, &i.batchPointIter) + i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, i.batchSeqNum) + // Only include the batch's rangedel iterator if it's non-empty. + // This requires some subtle logic in the case a rangedel is later + // written to the batch and the view of the batch is refreshed + // during a call to SetOptions—in this case, we need to reconstruct + // the point iterator to add the batch rangedel iterator. + var rangeDelIter keyspan.FragmentIterator + if i.batchRangeDelIter.Count() > 0 { + rangeDelIter = &i.batchRangeDelIter + } + mlevels = append(mlevels, mergingIterLevel{ + iter: &i.batchPointIter, + rangeDelIter: rangeDelIter, + }) + } + } + + if !i.batchOnlyIter { + // Next are the memtables. + for j := len(memtables) - 1; j >= 0; j-- { + mem := memtables[j] + mlevels = append(mlevels, mergingIterLevel{ + iter: mem.newIter(&i.opts), + rangeDelIter: mem.newRangeDelIter(&i.opts), + }) + } + + // Next are the file levels: L0 sub-levels followed by lower levels. 
+ mlevelsIndex := len(mlevels) + levelsIndex := len(levels) + mlevels = mlevels[:numMergingLevels] + levels = levels[:numLevelIters] + i.opts.snapshotForHideObsoletePoints = buf.dbi.seqNum + addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) { + li := &levels[levelsIndex] + + li.init(ctx, i.opts, &i.comparer, i.newIters, files, level, internalOpts) + li.initRangeDel(&mlevels[mlevelsIndex].rangeDelIter) + li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext) + li.initCombinedIterState(&i.lazyCombinedIter.combinedIterState) + mlevels[mlevelsIndex].levelIter = li + mlevels[mlevelsIndex].iter = invalidating.MaybeWrapIfInvariants(li) + + levelsIndex++ + mlevelsIndex++ + } + + // Add level iterators for the L0 sublevels, iterating from newest to + // oldest. + for i := len(current.L0SublevelFiles) - 1; i >= 0; i-- { + addLevelIterForFiles(current.L0SublevelFiles[i].Iter(), manifest.L0Sublevel(i)) + } + + // Add level iterators for the non-empty non-L0 levels. + for level := 1; level < len(current.Levels); level++ { + if current.Levels[level].Empty() { + continue + } + addLevelIterForFiles(current.Levels[level].Iter(), manifest.Level(level)) + } + } + buf.merging.init(&i.opts, &i.stats.InternalStats, i.comparer.Compare, i.comparer.Split, mlevels...) + if len(mlevels) <= cap(buf.levelsPositioned) { + buf.merging.levelsPositioned = buf.levelsPositioned[:len(mlevels)] + } + buf.merging.snapshot = i.seqNum + buf.merging.batchSnapshot = i.batchSeqNum + buf.merging.combinedIterState = &i.lazyCombinedIter.combinedIterState + i.pointIter = invalidating.MaybeWrapIfInvariants(&buf.merging) + i.merging = &buf.merging +} + +// NewBatch returns a new empty write-only batch. Any reads on the batch will +// return an error. If the batch is committed it will be applied to the DB. 
+func (d *DB) NewBatch() *Batch {
+ return newBatch(d)
+}
+
+// NewBatchWithSize is mostly identical to NewBatch, but it will allocate
+// the specified memory space for the internal slice in advance.
+func (d *DB) NewBatchWithSize(size int) *Batch {
+ return newBatchWithSize(d, size)
+}
+
+// NewIndexedBatch returns a new empty read-write batch. Any reads on the batch
+// will read from both the batch and the DB. If the batch is committed it will
+// be applied to the DB. An indexed batch is slower than a non-indexed batch
+// for insert operations. If you do not need to perform reads on the batch, use
+// NewBatch instead.
+func (d *DB) NewIndexedBatch() *Batch {
+ return newIndexedBatch(d, d.opts.Comparer)
+}
+
+// NewIndexedBatchWithSize is mostly identical to NewIndexedBatch, but it will
+// allocate the specified memory space for the internal slice in advance.
+func (d *DB) NewIndexedBatchWithSize(size int) *Batch {
+ return newIndexedBatchWithSize(d, d.opts.Comparer, size)
+}
+
+// NewIter returns an iterator that is unpositioned (Iterator.Valid() will
+// return false). The iterator can be positioned via a call to SeekGE, SeekLT,
+// First or Last. The iterator provides a point-in-time view of the current DB
+// state. This view is maintained by preventing file deletions and preventing
+// memtables referenced by the iterator from being deleted. Using an iterator
+// to maintain a long-lived point-in-time view of the DB state can lead to an
+// apparent memory and disk usage leak. Use snapshots (see NewSnapshot) for
+// point-in-time snapshots which avoids these problems.
+func (d *DB) NewIter(o *IterOptions) (*Iterator, error) {
+ return d.NewIterWithContext(context.Background(), o)
+}
+
+// NewIterWithContext is like NewIter, and additionally accepts a context for
+// tracing.
+func (d *DB) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) { + return d.newIter(ctx, nil /* batch */, newIterOpts{}, o), nil +} + +// NewSnapshot returns a point-in-time view of the current DB state. Iterators +// created with this handle will all observe a stable snapshot of the current +// DB state. The caller must call Snapshot.Close() when the snapshot is no +// longer needed. Snapshots are not persisted across DB restarts (close -> +// open). Unlike the implicit snapshot maintained by an iterator, a snapshot +// will not prevent memtables from being released or sstables from being +// deleted. Instead, a snapshot prevents deletion of sequence numbers +// referenced by the snapshot. +func (d *DB) NewSnapshot() *Snapshot { + if err := d.closed.Load(); err != nil { + panic(err) + } + + d.mu.Lock() + s := &Snapshot{ + db: d, + seqNum: d.mu.versions.visibleSeqNum.Load(), + } + d.mu.snapshots.pushBack(s) + d.mu.Unlock() + return s +} + +// NewEventuallyFileOnlySnapshot returns a point-in-time view of the current DB +// state, similar to NewSnapshot, but with consistency constrained to the +// provided set of key ranges. See the comment at EventuallyFileOnlySnapshot for +// its semantics. +func (d *DB) NewEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot { + if err := d.closed.Load(); err != nil { + panic(err) + } + + internalKeyRanges := make([]internalKeyRange, len(keyRanges)) + for i := range keyRanges { + if i > 0 && d.cmp(keyRanges[i-1].End, keyRanges[i].Start) > 0 { + panic("pebble: key ranges for eventually-file-only-snapshot not in order") + } + internalKeyRanges[i] = internalKeyRange{ + smallest: base.MakeInternalKey(keyRanges[i].Start, InternalKeySeqNumMax, InternalKeyKindMax), + largest: base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, keyRanges[i].End), + } + } + + return d.makeEventuallyFileOnlySnapshot(keyRanges, internalKeyRanges) +} + +// Close closes the DB. 
+//
+// It is not safe to close a DB until all outstanding iterators are closed
+// or to call Close concurrently with any other DB method. It is not valid
+// to call any of a DB's methods after the DB has been closed.
+func (d *DB) Close() error {
+ // Lock the commit pipeline for the duration of Close. This prevents a race
+ // with makeRoomForWrite. Rotating the WAL in makeRoomForWrite requires
+ // dropping d.mu several times for I/O. If Close only holds d.mu, an
+ // in-progress WAL rotation may re-acquire d.mu only once the database is
+ // closed.
+ //
+ // Additionally, locking the commit pipeline makes it more likely that
+ // (illegal) concurrent writes will observe d.closed.Load() != nil, creating
+ // more understandable panics if the database is improperly used concurrently
+ // during Close.
+ d.commit.mu.Lock()
+ defer d.commit.mu.Unlock()
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ if err := d.closed.Load(); err != nil {
+ panic(err)
+ }
+
+ // Clear the finalizer that is used to check that an unreferenced DB has been
+ // closed. We're closing the DB here, so the check performed by that
+ // finalizer isn't necessary.
+ //
+ // Note: this is a no-op if invariants are disabled or race is enabled.
+ invariants.SetFinalizer(d.closed, nil) + + d.closed.Store(errors.WithStack(ErrClosed)) + close(d.closedCh) + + defer d.opts.Cache.Unref() + + for d.mu.compact.compactingCount > 0 || d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + for d.mu.tableStats.loading { + d.mu.tableStats.cond.Wait() + } + for d.mu.tableValidation.validating { + d.mu.tableValidation.cond.Wait() + } + + var err error + if n := len(d.mu.compact.inProgress); n > 0 { + err = errors.Errorf("pebble: %d unexpected in-progress compactions", errors.Safe(n)) + } + err = firstError(err, d.mu.formatVers.marker.Close()) + err = firstError(err, d.tableCache.close()) + if !d.opts.ReadOnly { + err = firstError(err, d.mu.log.Close()) + } else if d.mu.log.LogWriter != nil { + panic("pebble: log-writer should be nil in read-only mode") + } + err = firstError(err, d.fileLock.Close()) + + // Note that versionSet.close() only closes the MANIFEST. The versions list + // is still valid for the checks below. + err = firstError(err, d.mu.versions.close()) + + err = firstError(err, d.dataDir.Close()) + if d.dataDir != d.walDir { + err = firstError(err, d.walDir.Close()) + } + + d.readState.val.unrefLocked() + + current := d.mu.versions.currentVersion() + for v := d.mu.versions.versions.Front(); true; v = v.Next() { + refs := v.Refs() + if v == current { + if refs != 1 { + err = firstError(err, errors.Errorf("leaked iterators: current\n%s", v)) + } + break + } + if refs != 0 { + err = firstError(err, errors.Errorf("leaked iterators:\n%s", v)) + } + } + + for _, mem := range d.mu.mem.queue { + // Usually, we'd want to delete the files returned by readerUnref. But + // in this case, even if we're unreferencing the flushables, the + // flushables aren't obsolete. They will be reconstructed during WAL + // replay. + mem.readerUnrefLocked(false) + } + // If there's an unused, recycled memtable, we need to release its memory. 
+ if obsoleteMemTable := d.memTableRecycle.Swap(nil); obsoleteMemTable != nil { + d.freeMemTable(obsoleteMemTable) + } + if reserved := d.memTableReserved.Load(); reserved != 0 { + err = firstError(err, errors.Errorf("leaked memtable reservation: %d", errors.Safe(reserved))) + } + + // Since we called d.readState.val.unrefLocked() above, we are expected to + // manually schedule deletion of obsolete files. + if len(d.mu.versions.obsoleteTables) > 0 { + d.deleteObsoleteFiles(d.mu.nextJobID) + } + + d.mu.Unlock() + d.compactionSchedulers.Wait() + + // Wait for all cleaning jobs to finish. + d.cleanupManager.Close() + + // Sanity check metrics. + if invariants.Enabled { + m := d.Metrics() + if m.Compact.NumInProgress > 0 || m.Compact.InProgressBytes > 0 { + d.mu.Lock() + panic(fmt.Sprintf("invalid metrics on close:\n%s", m)) + } + } + + d.mu.Lock() + + // As a sanity check, ensure that there are no zombie tables. A non-zero count + // hints at a reference count leak. + if ztbls := len(d.mu.versions.zombieTables); ztbls > 0 { + err = firstError(err, errors.Errorf("non-zero zombie file count: %d", ztbls)) + } + + err = firstError(err, d.objProvider.Close()) + + // If the options include a closer to 'close' the filesystem, close it. + if d.opts.private.fsCloser != nil { + d.opts.private.fsCloser.Close() + } + + // Return an error if the user failed to close all open snapshots. + if v := d.mu.snapshots.count(); v > 0 { + err = firstError(err, errors.Errorf("leaked snapshots: %d open snapshots on DB %p", v, d)) + } + + return err +} + +// Compact the specified range of keys in the database. 
+func (d *DB) Compact(start, end []byte, parallelize bool) error { + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.ReadOnly { + return ErrReadOnly + } + if d.cmp(start, end) >= 0 { + return errors.Errorf("Compact start %s is not less than end %s", + d.opts.Comparer.FormatKey(start), d.opts.Comparer.FormatKey(end)) + } + iStart := base.MakeInternalKey(start, InternalKeySeqNumMax, InternalKeyKindMax) + iEnd := base.MakeInternalKey(end, 0, 0) + m := (&fileMetadata{}).ExtendPointKeyBounds(d.cmp, iStart, iEnd) + meta := []*fileMetadata{m} + + d.mu.Lock() + maxLevelWithFiles := 1 + cur := d.mu.versions.currentVersion() + for level := 0; level < numLevels; level++ { + overlaps := cur.Overlaps(level, d.cmp, start, end, iEnd.IsExclusiveSentinel()) + if !overlaps.Empty() { + maxLevelWithFiles = level + 1 + } + } + + keyRanges := make([]internalKeyRange, len(meta)) + for i := range meta { + keyRanges[i] = internalKeyRange{smallest: m.Smallest, largest: m.Largest} + } + // Determine if any memtable overlaps with the compaction range. We wait for + // any such overlap to flush (initiating a flush if necessary). + mem, err := func() (*flushableEntry, error) { + // Check to see if any files overlap with any of the memtables. The queue + // is ordered from oldest to newest with the mutable memtable being the + // last element in the slice. We want to wait for the newest table that + // overlaps. + for i := len(d.mu.mem.queue) - 1; i >= 0; i-- { + mem := d.mu.mem.queue[i] + if ingestMemtableOverlaps(d.cmp, mem, keyRanges) { + var err error + if mem.flushable == d.mu.mem.mutable { + // We have to hold both commitPipeline.mu and DB.mu when calling + // makeRoomForWrite(). Lock order requirements elsewhere force us to + // unlock DB.mu in order to grab commitPipeline.mu first. + d.mu.Unlock() + d.commit.mu.Lock() + d.mu.Lock() + defer d.commit.mu.Unlock() + if mem.flushable == d.mu.mem.mutable { + // Only flush if the active memtable is unchanged. 
+ err = d.makeRoomForWrite(nil) + } + } + mem.flushForced = true + d.maybeScheduleFlush() + return mem, err + } + } + return nil, nil + }() + + d.mu.Unlock() + + if err != nil { + return err + } + if mem != nil { + <-mem.flushed + } + + for level := 0; level < maxLevelWithFiles; { + for { + if err := d.manualCompact( + iStart.UserKey, iEnd.UserKey, level, parallelize); err != nil { + if errors.Is(err, ErrCancelledCompaction) { + continue + } + return err + } + break + } + level++ + if level == numLevels-1 { + // A manual compaction of the bottommost level occurred. + // There is no next level to try and compact. + break + } + } + return nil +} + +func (d *DB) manualCompact(start, end []byte, level int, parallelize bool) error { + d.mu.Lock() + curr := d.mu.versions.currentVersion() + files := curr.Overlaps(level, d.cmp, start, end, false) + if files.Empty() { + d.mu.Unlock() + return nil + } + + var compactions []*manualCompaction + if parallelize { + compactions = append(compactions, d.splitManualCompaction(start, end, level)...) + } else { + compactions = append(compactions, &manualCompaction{ + level: level, + done: make(chan error, 1), + start: start, + end: end, + }) + } + d.mu.compact.manual = append(d.mu.compact.manual, compactions...) + d.maybeScheduleCompaction() + d.mu.Unlock() + + // Each of the channels is guaranteed to be eventually sent to once. After a + // compaction is possibly picked in d.maybeScheduleCompaction(), either the + // compaction is dropped, executed after being scheduled, or retried later. + // Assuming eventual progress when a compaction is retried, all outcomes send + // a value to the done channel. Since the channels are buffered, it is not + // necessary to read from each channel, and so we can exit early in the event + // of an error. 
+ for _, compaction := range compactions {
+ if err := <-compaction.done; err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// splitManualCompaction splits a manual compaction over [start,end] on level
+// such that the resulting compactions have no key overlap.
+func (d *DB) splitManualCompaction(
+ start, end []byte, level int,
+) (splitCompactions []*manualCompaction) {
+ curr := d.mu.versions.currentVersion()
+ endLevel := level + 1
+ baseLevel := d.mu.versions.picker.getBaseLevel()
+ if level == 0 {
+ endLevel = baseLevel
+ }
+ keyRanges := calculateInuseKeyRanges(curr, d.cmp, level, endLevel, start, end)
+ for _, keyRange := range keyRanges {
+ splitCompactions = append(splitCompactions, &manualCompaction{
+ level: level,
+ done: make(chan error, 1),
+ start: keyRange.Start,
+ end: keyRange.End,
+ split: true,
+ })
+ }
+ return splitCompactions
+}
+
+// DownloadSpan is a key range passed to the Download method.
+type DownloadSpan struct {
+ StartKey []byte
+ // EndKey is exclusive.
+ EndKey []byte
+}
+
+// Download ensures that the LSM does not use any external sstables for the
+// given key ranges. It does so by performing appropriate compactions so that
+// all external data becomes available locally.
+//
+// Note that calling this method does not imply that all other compactions stop;
+// it simply informs Pebble of a list of spans for which external data should be
+// downloaded with high priority.
+//
+// The method returns once no external sstables overlap the given spans, the
+// context is canceled, or an error is hit.
+//
+// TODO(radu): consider passing a priority/impact knob to express how important
+// the download is (versus live traffic performance, LSM health).
+func (d *DB) Download(ctx context.Context, spans []DownloadSpan) error {
+ return errors.Errorf("not implemented")
+}
+
+// Flush the memtable to stable storage.
+func (d *DB) Flush() error { + flushDone, err := d.AsyncFlush() + if err != nil { + return err + } + <-flushDone + return nil +} + +// AsyncFlush asynchronously flushes the memtable to stable storage. +// +// If no error is returned, the caller can receive from the returned channel in +// order to wait for the flush to complete. +func (d *DB) AsyncFlush() (<-chan struct{}, error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.ReadOnly { + return nil, ErrReadOnly + } + + d.commit.mu.Lock() + defer d.commit.mu.Unlock() + d.mu.Lock() + defer d.mu.Unlock() + flushed := d.mu.mem.queue[len(d.mu.mem.queue)-1].flushed + err := d.makeRoomForWrite(nil) + if err != nil { + return nil, err + } + return flushed, nil +} + +// Metrics returns metrics about the database. +func (d *DB) Metrics() *Metrics { + metrics := &Metrics{} + recycledLogsCount, recycledLogSize := d.logRecycler.stats() + + d.mu.Lock() + vers := d.mu.versions.currentVersion() + *metrics = d.mu.versions.metrics + metrics.Compact.EstimatedDebt = d.mu.versions.picker.estimatedCompactionDebt(0) + metrics.Compact.InProgressBytes = d.mu.versions.atomicInProgressBytes.Load() + metrics.Compact.NumInProgress = int64(d.mu.compact.compactingCount) + metrics.Compact.MarkedFiles = vers.Stats.MarkedForCompaction + metrics.Compact.Duration = d.mu.compact.duration + for c := range d.mu.compact.inProgress { + if c.kind != compactionKindFlush { + metrics.Compact.Duration += d.timeNow().Sub(c.beganAt) + } + } + + for _, m := range d.mu.mem.queue { + metrics.MemTable.Size += m.totalBytes() + } + metrics.Snapshots.Count = d.mu.snapshots.count() + if metrics.Snapshots.Count > 0 { + metrics.Snapshots.EarliestSeqNum = d.mu.snapshots.earliest() + } + metrics.Snapshots.PinnedKeys = d.mu.snapshots.cumulativePinnedCount + metrics.Snapshots.PinnedSize = d.mu.snapshots.cumulativePinnedSize + metrics.MemTable.Count = int64(len(d.mu.mem.queue)) + metrics.MemTable.ZombieCount = d.memTableCount.Load() - 
metrics.MemTable.Count + metrics.MemTable.ZombieSize = uint64(d.memTableReserved.Load()) - metrics.MemTable.Size + metrics.WAL.ObsoleteFiles = int64(recycledLogsCount) + metrics.WAL.ObsoletePhysicalSize = recycledLogSize + metrics.WAL.Size = d.logSize.Load() + // The current WAL size (d.atomic.logSize) is the current logical size, + // which may be less than the WAL's physical size if it was recycled. + // The file sizes in d.mu.log.queue are updated to the physical size + // during WAL rotation. Use the larger of the two for the current WAL. All + // the previous WALs's fileSizes in d.mu.log.queue are already updated. + metrics.WAL.PhysicalSize = metrics.WAL.Size + if len(d.mu.log.queue) > 0 && metrics.WAL.PhysicalSize < d.mu.log.queue[len(d.mu.log.queue)-1].fileSize { + metrics.WAL.PhysicalSize = d.mu.log.queue[len(d.mu.log.queue)-1].fileSize + } + for i, n := 0, len(d.mu.log.queue)-1; i < n; i++ { + metrics.WAL.PhysicalSize += d.mu.log.queue[i].fileSize + } + + metrics.WAL.BytesIn = d.mu.log.bytesIn // protected by d.mu + for i, n := 0, len(d.mu.mem.queue)-1; i < n; i++ { + metrics.WAL.Size += d.mu.mem.queue[i].logSize + } + metrics.WAL.BytesWritten = metrics.Levels[0].BytesIn + metrics.WAL.Size + if p := d.mu.versions.picker; p != nil { + compactions := d.getInProgressCompactionInfoLocked(nil) + for level, score := range p.getScores(compactions) { + metrics.Levels[level].Score = score + } + } + metrics.Table.ZombieCount = int64(len(d.mu.versions.zombieTables)) + for _, size := range d.mu.versions.zombieTables { + metrics.Table.ZombieSize += size + } + metrics.private.optionsFileSize = d.optionsFileSize + + // TODO(jackson): Consider making these metrics optional. 
+ metrics.Keys.RangeKeySetsCount = countRangeKeySetFragments(vers) + metrics.Keys.TombstoneCount = countTombstones(vers) + + d.mu.versions.logLock() + metrics.private.manifestFileSize = uint64(d.mu.versions.manifest.Size()) + metrics.Table.BackingTableCount = uint64(len(d.mu.versions.backingState.fileBackingMap)) + metrics.Table.BackingTableSize = d.mu.versions.backingState.fileBackingSize + if invariants.Enabled { + var totalSize uint64 + for _, backing := range d.mu.versions.backingState.fileBackingMap { + totalSize += backing.Size + } + if totalSize != metrics.Table.BackingTableSize { + panic("pebble: invalid backing table size accounting") + } + } + d.mu.versions.logUnlock() + + metrics.LogWriter.FsyncLatency = d.mu.log.metrics.fsyncLatency + if err := metrics.LogWriter.Merge(&d.mu.log.metrics.LogWriterMetrics); err != nil { + d.opts.Logger.Errorf("metrics error: %s", err) + } + metrics.Flush.WriteThroughput = d.mu.compact.flushWriteThroughput + if d.mu.compact.flushing { + metrics.Flush.NumInProgress = 1 + } + for i := 0; i < numLevels; i++ { + metrics.Levels[i].Additional.ValueBlocksSize = valueBlocksSizeForLevel(vers, i) + } + + d.mu.Unlock() + + metrics.BlockCache = d.opts.Cache.Metrics() + metrics.TableCache, metrics.Filter = d.tableCache.metrics() + metrics.TableIters = int64(d.tableCache.iterCount()) + metrics.CategoryStats = d.tableCache.dbOpts.sstStatsCollector.GetStats() + + metrics.SecondaryCacheMetrics = d.objProvider.Metrics() + + metrics.Uptime = d.timeNow().Sub(d.openedAt) + + return metrics +} + +// sstablesOptions hold the optional parameters to retrieve TableInfo for all sstables. +type sstablesOptions struct { + // set to true will return the sstable properties in TableInfo + withProperties bool + + // if set, return sstables that overlap the key range (end-exclusive) + start []byte + end []byte + + withApproximateSpanBytes bool +} + +// SSTablesOption set optional parameter used by `DB.SSTables`. 
+type SSTablesOption func(*sstablesOptions) + +// WithProperties enable return sstable properties in each TableInfo. +// +// NOTE: if most of the sstable properties need to be read from disk, +// this options may make method `SSTables` quite slow. +func WithProperties() SSTablesOption { + return func(opt *sstablesOptions) { + opt.withProperties = true + } +} + +// WithKeyRangeFilter ensures returned sstables overlap start and end (end-exclusive) +// if start and end are both nil these properties have no effect. +func WithKeyRangeFilter(start, end []byte) SSTablesOption { + return func(opt *sstablesOptions) { + opt.end = end + opt.start = start + } +} + +// WithApproximateSpanBytes enables capturing the approximate number of bytes that +// overlap the provided key span for each sstable. +// NOTE: this option can only be used with WithKeyRangeFilter and WithProperties +// provided. +func WithApproximateSpanBytes() SSTablesOption { + return func(opt *sstablesOptions) { + opt.withApproximateSpanBytes = true + } +} + +// BackingType denotes the type of storage backing a given sstable. +type BackingType int + +const ( + // BackingTypeLocal denotes an sstable stored on local disk according to the + // objprovider. This file is completely owned by us. + BackingTypeLocal BackingType = iota + // BackingTypeShared denotes an sstable stored on shared storage, created + // by this Pebble instance and possibly shared by other Pebble instances. + // These types of files have lifecycle managed by Pebble. + BackingTypeShared + // BackingTypeSharedForeign denotes an sstable stored on shared storage, + // created by a Pebble instance other than this one. These types of files have + // lifecycle managed by Pebble. + BackingTypeSharedForeign + // BackingTypeExternal denotes an sstable stored on external storage, + // not owned by any Pebble instance and with no refcounting/cleanup methods + // or lifecycle management. 
An example of an external file is a file restored + // from a backup. + BackingTypeExternal +) + +// SSTableInfo export manifest.TableInfo with sstable.Properties alongside +// other file backing info. +type SSTableInfo struct { + manifest.TableInfo + // Virtual indicates whether the sstable is virtual. + Virtual bool + // BackingSSTNum is the file number associated with backing sstable which + // backs the sstable associated with this SSTableInfo. If Virtual is false, + // then BackingSSTNum == FileNum. + BackingSSTNum base.FileNum + // BackingType is the type of storage backing this sstable. + BackingType BackingType + // Locator is the remote.Locator backing this sstable, if the backing type is + // not BackingTypeLocal. + Locator remote.Locator + + // Properties is the sstable properties of this table. If Virtual is true, + // then the Properties are associated with the backing sst. + Properties *sstable.Properties +} + +// SSTables retrieves the current sstables. The returned slice is indexed by +// level and each level is indexed by the position of the sstable within the +// level. Note that this information may be out of date due to concurrent +// flushes and compactions. +func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error) { + opt := &sstablesOptions{} + for _, fn := range opts { + fn(opt) + } + + if opt.withApproximateSpanBytes && !opt.withProperties { + return nil, errors.Errorf("Cannot use WithApproximateSpanBytes without WithProperties option.") + } + if opt.withApproximateSpanBytes && (opt.start == nil || opt.end == nil) { + return nil, errors.Errorf("Cannot use WithApproximateSpanBytes without WithKeyRangeFilter option.") + } + + // Grab and reference the current readState. + readState := d.loadReadState() + defer readState.unref() + + // TODO(peter): This is somewhat expensive, especially on a large + // database. It might be worthwhile to unify TableInfo and FileMetadata and + // then we could simply return current.Files. 
Note that RocksDB is doing + // something similar to the current code, so perhaps it isn't too bad. + srcLevels := readState.current.Levels + var totalTables int + for i := range srcLevels { + totalTables += srcLevels[i].Len() + } + + destTables := make([]SSTableInfo, totalTables) + destLevels := make([][]SSTableInfo, len(srcLevels)) + for i := range destLevels { + iter := srcLevels[i].Iter() + j := 0 + for m := iter.First(); m != nil; m = iter.Next() { + if opt.start != nil && opt.end != nil && !m.Overlaps(d.opts.Comparer.Compare, opt.start, opt.end, true /* exclusive end */) { + continue + } + destTables[j] = SSTableInfo{TableInfo: m.TableInfo()} + if opt.withProperties { + p, err := d.tableCache.getTableProperties( + m, + ) + if err != nil { + return nil, err + } + destTables[j].Properties = p + } + destTables[j].Virtual = m.Virtual + destTables[j].BackingSSTNum = m.FileBacking.DiskFileNum.FileNum() + objMeta, err := d.objProvider.Lookup(fileTypeTable, m.FileBacking.DiskFileNum) + if err != nil { + return nil, err + } + if objMeta.IsRemote() { + if objMeta.IsShared() { + if d.objProvider.IsSharedForeign(objMeta) { + destTables[j].BackingType = BackingTypeSharedForeign + } else { + destTables[j].BackingType = BackingTypeShared + } + } else { + destTables[j].BackingType = BackingTypeExternal + } + destTables[j].Locator = objMeta.Remote.Locator + } else { + destTables[j].BackingType = BackingTypeLocal + } + + if opt.withApproximateSpanBytes { + var spanBytes uint64 + if m.ContainedWithinSpan(d.opts.Comparer.Compare, opt.start, opt.end) { + spanBytes = m.Size + } else { + size, err := d.tableCache.estimateSize(m, opt.start, opt.end) + if err != nil { + return nil, err + } + spanBytes = size + } + propertiesCopy := *destTables[j].Properties + + // Deep copy user properties so approximate span bytes can be added. 
+ propertiesCopy.UserProperties = make(map[string]string, len(destTables[j].Properties.UserProperties)+1)
+ for k, v := range destTables[j].Properties.UserProperties {
+ propertiesCopy.UserProperties[k] = v
+ }
+ propertiesCopy.UserProperties["approximate-span-bytes"] = strconv.FormatUint(spanBytes, 10)
+ destTables[j].Properties = &propertiesCopy
+ }
+ j++
+ }
+ destLevels[i] = destTables[:j]
+ destTables = destTables[j:]
+ }
+
+ return destLevels, nil
+}
+
+// EstimateDiskUsage returns the estimated filesystem space used in bytes for
+// storing the range `[start, end]`. The estimation is computed as follows:
+//
+// - For sstables fully contained in the range the whole file size is included.
+// - For sstables partially contained in the range the overlapping data block sizes
+// are included. Even if a data block partially overlaps, or we cannot determine
+// overlap due to abbreviated index keys, the full data block size is included in
+// the estimation. Note that unlike fully contained sstables, none of the
+// meta-block space is counted for partially overlapped files.
+// - For virtual sstables, we use the overlap between start, end and the virtual
+// sstable bounds to determine disk usage.
+// - There may also exist WAL entries for unflushed keys in this range. This
+// estimation currently excludes space used for the range in the WAL.
+func (d *DB) EstimateDiskUsage(start, end []byte) (uint64, error) {
+ bytes, _, _, err := d.EstimateDiskUsageByBackingType(start, end)
+ return bytes, err
+}
+
+// EstimateDiskUsageByBackingType is like EstimateDiskUsage but additionally
+// returns the subsets of that size in remote and external files.
+func (d *DB) EstimateDiskUsageByBackingType( + start, end []byte, +) (totalSize, remoteSize, externalSize uint64, _ error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.Comparer.Compare(start, end) > 0 { + return 0, 0, 0, errors.New("invalid key-range specified (start > end)") + } + + // Grab and reference the current readState. This prevents the underlying + // files in the associated version from being deleted if there is a concurrent + // compaction. + readState := d.loadReadState() + defer readState.unref() + + for level, files := range readState.current.Levels { + iter := files.Iter() + if level > 0 { + // We can only use `Overlaps` to restrict `files` at L1+ since at L0 it + // expands the range iteratively until it has found a set of files that + // do not overlap any other L0 files outside that set. + overlaps := readState.current.Overlaps(level, d.opts.Comparer.Compare, start, end, false /* exclusiveEnd */) + iter = overlaps.Iter() + } + for file := iter.First(); file != nil; file = iter.Next() { + if d.opts.Comparer.Compare(start, file.Smallest.UserKey) <= 0 && + d.opts.Comparer.Compare(file.Largest.UserKey, end) <= 0 { + // The range fully contains the file, so skip looking it up in + // table cache/looking at its indexes, and add the full file size. 
+ meta, err := d.objProvider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) + if err != nil { + return 0, 0, 0, err + } + if meta.IsRemote() { + remoteSize += file.Size + if meta.Remote.CleanupMethod == objstorage.SharedNoCleanup { + externalSize += file.Size + } + } + totalSize += file.Size + } else if d.opts.Comparer.Compare(file.Smallest.UserKey, end) <= 0 && + d.opts.Comparer.Compare(start, file.Largest.UserKey) <= 0 { + var size uint64 + var err error + if file.Virtual { + err = d.tableCache.withVirtualReader( + file.VirtualMeta(), + func(r sstable.VirtualReader) (err error) { + size, err = r.EstimateDiskUsage(start, end) + return err + }, + ) + } else { + err = d.tableCache.withReader( + file.PhysicalMeta(), + func(r *sstable.Reader) (err error) { + size, err = r.EstimateDiskUsage(start, end) + return err + }, + ) + } + if err != nil { + return 0, 0, 0, err + } + meta, err := d.objProvider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) + if err != nil { + return 0, 0, 0, err + } + if meta.IsRemote() { + remoteSize += size + if meta.Remote.CleanupMethod == objstorage.SharedNoCleanup { + externalSize += size + } + } + totalSize += size + } + } + } + return totalSize, remoteSize, externalSize, nil +} + +func (d *DB) walPreallocateSize() int { + // Set the WAL preallocate size to 110% of the memtable size. Note that there + // is a bit of apples and oranges in units here as the memtabls size + // corresponds to the memory usage of the memtable while the WAL size is the + // size of the batches (plus overhead) stored in the WAL. + // + // TODO(peter): 110% of the memtable size is quite hefty for a block + // size. This logic is taken from GetWalPreallocateBlockSize in + // RocksDB. Could a smaller preallocation block size be used? 
+ size := d.opts.MemTableSize + size = (size / 10) + size + return int(size) +} + +func (d *DB) newMemTable(logNum base.DiskFileNum, logSeqNum uint64) (*memTable, *flushableEntry) { + size := d.mu.mem.nextSize + if d.mu.mem.nextSize < d.opts.MemTableSize { + d.mu.mem.nextSize *= 2 + if d.mu.mem.nextSize > d.opts.MemTableSize { + d.mu.mem.nextSize = d.opts.MemTableSize + } + } + + memtblOpts := memTableOptions{ + Options: d.opts, + logSeqNum: logSeqNum, + } + + // Before attempting to allocate a new memtable, check if there's one + // available for recycling in memTableRecycle. Large contiguous allocations + // can be costly as fragmentation makes it more difficult to find a large + // contiguous free space. We've observed 64MB allocations taking 10ms+. + // + // To reduce these costly allocations, up to 1 obsolete memtable is stashed + // in `d.memTableRecycle` to allow a future memtable rotation to reuse + // existing memory. + var mem *memTable + mem = d.memTableRecycle.Swap(nil) + if mem != nil && uint64(len(mem.arenaBuf)) != size { + d.freeMemTable(mem) + mem = nil + } + if mem != nil { + // Carry through the existing buffer and memory reservation. + memtblOpts.arenaBuf = mem.arenaBuf + memtblOpts.releaseAccountingReservation = mem.releaseAccountingReservation + } else { + mem = new(memTable) + memtblOpts.arenaBuf = manual.New(int(size)) + memtblOpts.releaseAccountingReservation = d.opts.Cache.Reserve(int(size)) + d.memTableCount.Add(1) + d.memTableReserved.Add(int64(size)) + + // Note: this is a no-op if invariants are disabled or race is enabled. + invariants.SetFinalizer(mem, checkMemTable) + } + mem.init(memtblOpts) + + entry := d.newFlushableEntry(mem, logNum, logSeqNum) + entry.releaseMemAccounting = func() { + // If the user leaks iterators, we may be releasing the memtable after + // the DB is already closed. In this case, we want to just release the + // memory because DB.Close won't come along to free it for us. 
+ if err := d.closed.Load(); err != nil { + d.freeMemTable(mem) + return + } + + // The next memtable allocation might be able to reuse this memtable. + // Stash it on d.memTableRecycle. + if unusedMem := d.memTableRecycle.Swap(mem); unusedMem != nil { + // There was already a memtable waiting to be recycled. We're now + // responsible for freeing it. + d.freeMemTable(unusedMem) + } + } + return mem, entry +} + +func (d *DB) freeMemTable(m *memTable) { + d.memTableCount.Add(-1) + d.memTableReserved.Add(-int64(len(m.arenaBuf))) + m.free() +} + +func (d *DB) newFlushableEntry( + f flushable, logNum base.DiskFileNum, logSeqNum uint64, +) *flushableEntry { + fe := &flushableEntry{ + flushable: f, + flushed: make(chan struct{}), + logNum: logNum, + logSeqNum: logSeqNum, + deleteFn: d.mu.versions.addObsolete, + deleteFnLocked: d.mu.versions.addObsoleteLocked, + } + fe.readerRefs.Store(1) + return fe +} + +// makeRoomForWrite ensures that the memtable has room to hold the contents of +// Batch. It reserves the space in the memtable and adds a reference to the +// memtable. The caller must later ensure that the memtable is unreferenced. If +// the memtable is full, or a nil Batch is provided, the current memtable is +// rotated (marked as immutable) and a new mutable memtable is allocated. This +// memtable rotation also causes a log rotation. +// +// Both DB.mu and commitPipeline.mu must be held by the caller. Note that DB.mu +// may be released and reacquired. 
+func (d *DB) makeRoomForWrite(b *Batch) error { + if b != nil && b.ingestedSSTBatch { + panic("pebble: invalid function call") + } + + force := b == nil || b.flushable != nil + stalled := false + for { + if b != nil && b.flushable == nil { + err := d.mu.mem.mutable.prepare(b) + if err != arenaskl.ErrArenaFull { + if stalled { + d.opts.EventListener.WriteStallEnd() + } + return err + } + } else if !force { + if stalled { + d.opts.EventListener.WriteStallEnd() + } + return nil + } + // force || err == ErrArenaFull, so we need to rotate the current memtable. + { + var size uint64 + for i := range d.mu.mem.queue { + size += d.mu.mem.queue[i].totalBytes() + } + if size >= uint64(d.opts.MemTableStopWritesThreshold)*d.opts.MemTableSize { + // We have filled up the current memtable, but already queued memtables + // are still flushing, so we wait. + if !stalled { + stalled = true + d.opts.EventListener.WriteStallBegin(WriteStallBeginInfo{ + Reason: "memtable count limit reached", + }) + } + now := time.Now() + d.mu.compact.cond.Wait() + if b != nil { + b.commitStats.MemTableWriteStallDuration += time.Since(now) + } + continue + } + } + l0ReadAmp := d.mu.versions.currentVersion().L0Sublevels.ReadAmplification() + if l0ReadAmp >= d.opts.L0StopWritesThreshold { + // There are too many level-0 files, so we wait. 
+ if !stalled { + stalled = true + d.opts.EventListener.WriteStallBegin(WriteStallBeginInfo{ + Reason: "L0 file count limit exceeded", + }) + } + now := time.Now() + d.mu.compact.cond.Wait() + if b != nil { + b.commitStats.L0ReadAmpWriteStallDuration += time.Since(now) + } + continue + } + + var newLogNum base.DiskFileNum + var prevLogSize uint64 + if !d.opts.DisableWAL { + now := time.Now() + newLogNum, prevLogSize = d.recycleWAL() + if b != nil { + b.commitStats.WALRotationDuration += time.Since(now) + } + } + + immMem := d.mu.mem.mutable + imm := d.mu.mem.queue[len(d.mu.mem.queue)-1] + imm.logSize = prevLogSize + imm.flushForced = imm.flushForced || (b == nil) + + // If we are manually flushing and we used less than half of the bytes in + // the memtable, don't increase the size for the next memtable. This + // reduces memtable memory pressure when an application is frequently + // manually flushing. + if (b == nil) && uint64(immMem.availBytes()) > immMem.totalBytes()/2 { + d.mu.mem.nextSize = immMem.totalBytes() + } + + if b != nil && b.flushable != nil { + // The batch is too large to fit in the memtable so add it directly to + // the immutable queue. The flushable batch is associated with the same + // log as the immutable memtable, but logically occurs after it in + // seqnum space. We ensure while flushing that the flushable batch + // is flushed along with the previous memtable in the flushable + // queue. See the top level comment in DB.flush1 to learn how this + // is ensured. + // + // See DB.commitWrite for the special handling of log writes for large + // batches. In particular, the large batch has already written to + // imm.logNum. + entry := d.newFlushableEntry(b.flushable, imm.logNum, b.SeqNum()) + // The large batch is by definition large. Reserve space from the cache + // for it until it is flushed. 
+ entry.releaseMemAccounting = d.opts.Cache.Reserve(int(b.flushable.totalBytes())) + d.mu.mem.queue = append(d.mu.mem.queue, entry) + } + + var logSeqNum uint64 + if b != nil { + logSeqNum = b.SeqNum() + if b.flushable != nil { + logSeqNum += uint64(b.Count()) + } + } else { + logSeqNum = d.mu.versions.logSeqNum.Load() + } + d.rotateMemtable(newLogNum, logSeqNum, immMem) + force = false + } +} + +// Both DB.mu and commitPipeline.mu must be held by the caller. +func (d *DB) rotateMemtable(newLogNum base.DiskFileNum, logSeqNum uint64, prev *memTable) { + // Create a new memtable, scheduling the previous one for flushing. We do + // this even if the previous memtable was empty because the DB.Flush + // mechanism is dependent on being able to wait for the empty memtable to + // flush. We can't just mark the empty memtable as flushed here because we + // also have to wait for all previous immutable tables to + // flush. Additionally, the memtable is tied to particular WAL file and we + // want to go through the flush path in order to recycle that WAL file. + // + // NB: newLogNum corresponds to the WAL that contains mutations that are + // present in the new memtable. When immutable memtables are flushed to + // disk, a VersionEdit will be created telling the manifest the minimum + // unflushed log number (which will be the next one in d.mu.mem.mutable + // that was not flushed). + // + // NB: prev should be the current mutable memtable. + var entry *flushableEntry + d.mu.mem.mutable, entry = d.newMemTable(newLogNum, logSeqNum) + d.mu.mem.queue = append(d.mu.mem.queue, entry) + d.updateReadStateLocked(nil) + if prev.writerUnref() { + d.maybeScheduleFlush() + } +} + +// Both DB.mu and commitPipeline.mu must be held by the caller. Note that DB.mu +// may be released and reacquired. 
+func (d *DB) recycleWAL() (newLogNum base.DiskFileNum, prevLogSize uint64) { + if d.opts.DisableWAL { + panic("pebble: invalid function call") + } + + jobID := d.mu.nextJobID + d.mu.nextJobID++ + newLogNum = d.mu.versions.getNextDiskFileNum() + + prevLogSize = uint64(d.mu.log.Size()) + + // The previous log may have grown past its original physical + // size. Update its file size in the queue so we have a proper + // accounting of its file size. + if d.mu.log.queue[len(d.mu.log.queue)-1].fileSize < prevLogSize { + d.mu.log.queue[len(d.mu.log.queue)-1].fileSize = prevLogSize + } + d.mu.Unlock() + + var err error + // Close the previous log first. This writes an EOF trailer + // signifying the end of the file and syncs it to disk. We must + // close the previous log before linking the new log file, + // otherwise a crash could leave both logs with unclean tails, and + // Open will treat the previous log as corrupt. + err = d.mu.log.LogWriter.Close() + metrics := d.mu.log.LogWriter.Metrics() + d.mu.Lock() + if err := d.mu.log.metrics.Merge(metrics); err != nil { + d.opts.Logger.Errorf("metrics error: %s", err) + } + d.mu.Unlock() + + newLogName := base.MakeFilepath(d.opts.FS, d.walDirname, fileTypeLog, newLogNum) + + // Try to use a recycled log file. Recycling log files is an important + // performance optimization as it is faster to sync a file that has + // already been written, than one which is being written for the first + // time. This is due to the need to sync file metadata when a file is + // being written for the first time. Note this is true even if file + // preallocation is performed (e.g. fallocate). 
+ var recycleLog fileInfo + var recycleOK bool + var newLogFile vfs.File + if err == nil { + recycleLog, recycleOK = d.logRecycler.peek() + if recycleOK { + recycleLogName := base.MakeFilepath(d.opts.FS, d.walDirname, fileTypeLog, recycleLog.fileNum) + newLogFile, err = d.opts.FS.ReuseForWrite(recycleLogName, newLogName) + base.MustExist(d.opts.FS, newLogName, d.opts.Logger, err) + } else { + newLogFile, err = d.opts.FS.Create(newLogName) + base.MustExist(d.opts.FS, newLogName, d.opts.Logger, err) + } + } + + var newLogSize uint64 + if err == nil && recycleOK { + // Figure out the recycled WAL size. This Stat is necessary + // because ReuseForWrite's contract allows for removing the + // old file and creating a new one. We don't know whether the + // WAL was actually recycled. + // TODO(jackson): Adding a boolean to the ReuseForWrite return + // value indicating whether or not the file was actually + // reused would allow us to skip the stat and use + // recycleLog.fileSize. + var finfo os.FileInfo + finfo, err = newLogFile.Stat() + if err == nil { + newLogSize = uint64(finfo.Size()) + } + } + + if err == nil { + // TODO(peter): RocksDB delays sync of the parent directory until the + // first time the log is synced. Is that worthwhile? 
+ err = d.walDir.Sync() + } + + if err != nil && newLogFile != nil { + newLogFile.Close() + } else if err == nil { + newLogFile = vfs.NewSyncingFile(newLogFile, vfs.SyncingFileOptions{ + NoSyncOnClose: d.opts.NoSyncOnClose, + BytesPerSync: d.opts.WALBytesPerSync, + PreallocateSize: d.walPreallocateSize(), + }) + } + + if recycleOK { + err = firstError(err, d.logRecycler.pop(recycleLog.fileNum.FileNum())) + } + + d.opts.EventListener.WALCreated(WALCreateInfo{ + JobID: jobID, + Path: newLogName, + FileNum: newLogNum, + RecycledFileNum: recycleLog.fileNum.FileNum(), + Err: err, + }) + + d.mu.Lock() + + d.mu.versions.metrics.WAL.Files++ + + if err != nil { + // TODO(peter): avoid chewing through file numbers in a tight loop if there + // is an error here. + // + // What to do here? Stumbling on doesn't seem worthwhile. If we failed to + // close the previous log it is possible we lost a write. + panic(err) + } + + d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum, fileSize: newLogSize}) + d.mu.log.LogWriter = record.NewLogWriter(newLogFile, newLogNum, record.LogWriterConfig{ + WALFsyncLatency: d.mu.log.metrics.fsyncLatency, + WALMinSyncInterval: d.opts.WALMinSyncInterval, + QueueSemChan: d.commit.logSyncQSem, + }) + if d.mu.log.registerLogWriterForTesting != nil { + d.mu.log.registerLogWriterForTesting(d.mu.log.LogWriter) + } + + return +} + +func (d *DB) getEarliestUnflushedSeqNumLocked() uint64 { + seqNum := InternalKeySeqNumMax + for i := range d.mu.mem.queue { + logSeqNum := d.mu.mem.queue[i].logSeqNum + if seqNum > logSeqNum { + seqNum = logSeqNum + } + } + return seqNum +} + +func (d *DB) getInProgressCompactionInfoLocked(finishing *compaction) (rv []compactionInfo) { + for c := range d.mu.compact.inProgress { + if len(c.flushing) == 0 && (finishing == nil || c != finishing) { + info := compactionInfo{ + versionEditApplied: c.versionEditApplied, + inputs: c.inputs, + smallest: c.smallest, + largest: c.largest, + outputLevel: -1, + } + if 
c.outputLevel != nil { + info.outputLevel = c.outputLevel.level + } + rv = append(rv, info) + } + } + return +} + +func inProgressL0Compactions(inProgress []compactionInfo) []manifest.L0Compaction { + var compactions []manifest.L0Compaction + for _, info := range inProgress { + // Skip in-progress compactions that have already committed; the L0 + // sublevels initialization code requires the set of in-progress + // compactions to be consistent with the current version. Compactions + // with versionEditApplied=true are already applied to the current + // version and but are performing cleanup without the database mutex. + if info.versionEditApplied { + continue + } + l0 := false + for _, cl := range info.inputs { + l0 = l0 || cl.level == 0 + } + if !l0 { + continue + } + compactions = append(compactions, manifest.L0Compaction{ + Smallest: info.smallest, + Largest: info.largest, + IsIntraL0: info.outputLevel == 0, + }) + } + return compactions +} + +// firstError returns the first non-nil error of err0 and err1, or nil if both +// are nil. +func firstError(err0, err1 error) error { + if err0 != nil { + return err0 + } + return err1 +} + +// SetCreatorID sets the CreatorID which is needed in order to use shared objects. +// Remote object usage is disabled until this method is called the first time. +// Once set, the Creator ID is persisted and cannot change. +// +// Does nothing if SharedStorage was not set in the options when the DB was +// opened or if the DB is in read-only mode. +func (d *DB) SetCreatorID(creatorID uint64) error { + if d.opts.Experimental.RemoteStorage == nil || d.opts.ReadOnly { + return nil + } + return d.objProvider.SetCreatorID(objstorage.CreatorID(creatorID)) +} + +// KeyStatistics keeps track of the number of keys that have been pinned by a +// snapshot as well as counts of the different key kinds in the lsm. 
+// +// One way of using the accumulated stats, when we only have sets and dels, +// and say the counts are represented as del_count, set_count, +// del_latest_count, set_latest_count, snapshot_pinned_count. +// +// - del_latest_count + set_latest_count is the set of unique user keys +// (unique). +// +// - set_latest_count is the set of live unique user keys (live_unique). +// +// - Garbage is del_count + set_count - live_unique. +// +// - If everything were in the LSM, del_count+set_count-snapshot_pinned_count +// would also be the set of unique user keys (note that +// snapshot_pinned_count is counting something different -- see comment below). +// But snapshot_pinned_count only counts keys in the LSM so the excess here +// must be keys in memtables. +type KeyStatistics struct { + // TODO(sumeer): the SnapshotPinned* are incorrect in that these older + // versions can be in a different level. Either fix the accounting or + // rename these fields. + + // SnapshotPinnedKeys represents obsolete keys that cannot be elided during + // a compaction, because they are required by an open snapshot. + SnapshotPinnedKeys int + // SnapshotPinnedKeysBytes is the total number of bytes of all snapshot + // pinned keys. + SnapshotPinnedKeysBytes uint64 + // KindsCount is the count for each kind of key. It includes point keys, + // range deletes and range keys. + KindsCount [InternalKeyKindMax + 1]int + // LatestKindsCount is the count for each kind of key when it is the latest + // kind for a user key. It is only populated for point keys. + LatestKindsCount [InternalKeyKindMax + 1]int +} + +// LSMKeyStatistics is used by DB.ScanStatistics. +type LSMKeyStatistics struct { + Accumulated KeyStatistics + // Levels contains statistics only for point keys. Range deletions and range keys will + // appear in Accumulated but not Levels. 
+ Levels [numLevels]KeyStatistics + // BytesRead represents the logical, pre-compression size of keys and values read + BytesRead uint64 +} + +// ScanStatisticsOptions is used by DB.ScanStatistics. +type ScanStatisticsOptions struct { + // LimitBytesPerSecond indicates the number of bytes that are able to be read + // per second using ScanInternal. + // A value of 0 indicates that there is no limit set. + LimitBytesPerSecond int64 +} + +// ScanStatistics returns the count of different key kinds within the lsm for a +// key span [lower, upper) as well as the number of snapshot keys. +func (d *DB) ScanStatistics( + ctx context.Context, lower, upper []byte, opts ScanStatisticsOptions, +) (LSMKeyStatistics, error) { + stats := LSMKeyStatistics{} + var prevKey InternalKey + var rateLimitFunc func(key *InternalKey, val LazyValue) error + tb := tokenbucket.TokenBucket{} + + if opts.LimitBytesPerSecond != 0 { + // Each "token" roughly corresponds to a byte that was read. + tb.Init(tokenbucket.TokensPerSecond(opts.LimitBytesPerSecond), tokenbucket.Tokens(1024)) + rateLimitFunc = func(key *InternalKey, val LazyValue) error { + return tb.WaitCtx(ctx, tokenbucket.Tokens(key.Size()+val.Len())) + } + } + + scanInternalOpts := &scanInternalOptions{ + visitPointKey: func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error { + // If the previous key is equal to the current point key, the current key was + // pinned by a snapshot. 
+ size := uint64(key.Size()) + kind := key.Kind() + sameKey := d.equal(prevKey.UserKey, key.UserKey) + if iterInfo.Kind == IteratorLevelLSM && sameKey { + stats.Levels[iterInfo.Level].SnapshotPinnedKeys++ + stats.Levels[iterInfo.Level].SnapshotPinnedKeysBytes += size + stats.Accumulated.SnapshotPinnedKeys++ + stats.Accumulated.SnapshotPinnedKeysBytes += size + } + if iterInfo.Kind == IteratorLevelLSM { + stats.Levels[iterInfo.Level].KindsCount[kind]++ + } + if !sameKey { + if iterInfo.Kind == IteratorLevelLSM { + stats.Levels[iterInfo.Level].LatestKindsCount[kind]++ + } + stats.Accumulated.LatestKindsCount[kind]++ + } + + stats.Accumulated.KindsCount[kind]++ + prevKey.CopyFrom(*key) + stats.BytesRead += uint64(key.Size() + value.Len()) + return nil + }, + visitRangeDel: func(start, end []byte, seqNum uint64) error { + stats.Accumulated.KindsCount[InternalKeyKindRangeDelete]++ + stats.BytesRead += uint64(len(start) + len(end)) + return nil + }, + visitRangeKey: func(start, end []byte, keys []rangekey.Key) error { + stats.BytesRead += uint64(len(start) + len(end)) + for _, key := range keys { + stats.Accumulated.KindsCount[key.Kind()]++ + stats.BytesRead += uint64(len(key.Value) + len(key.Suffix)) + } + return nil + }, + includeObsoleteKeys: true, + IterOptions: IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: lower, + UpperBound: upper, + }, + rateLimitFunc: rateLimitFunc, + } + iter, err := d.newInternalIter(ctx, snapshotIterOpts{}, scanInternalOpts) + if err != nil { + return LSMKeyStatistics{}, err + } + defer iter.close() + + err = scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) + + if err != nil { + return LSMKeyStatistics{}, err + } + + return stats, nil +} + +// ObjProvider returns the objstorage.Provider for this database. Meant to be +// used for internal purposes only. 
+func (d *DB) ObjProvider() objstorage.Provider { + return d.objProvider +} + +func (d *DB) checkVirtualBounds(m *fileMetadata) { + if !invariants.Enabled { + return + } + + objMeta, err := d.objProvider.Lookup(fileTypeTable, m.FileBacking.DiskFileNum) + if err != nil { + panic(err) + } + if objMeta.IsExternal() { + // Nothing to do; bounds are expected to be loose. + return + } + + if m.HasPointKeys { + pointIter, rangeDelIter, err := d.newIters(context.TODO(), m, nil, internalIterOpts{}) + if err != nil { + panic(errors.Wrap(err, "pebble: error creating point iterator")) + } + + defer pointIter.Close() + if rangeDelIter != nil { + defer rangeDelIter.Close() + } + + pointKey, _ := pointIter.First() + var rangeDel *keyspan.Span + if rangeDelIter != nil { + rangeDel = rangeDelIter.First() + } + + // Check that the lower bound is tight. + if (rangeDel == nil || d.cmp(rangeDel.SmallestKey().UserKey, m.SmallestPointKey.UserKey) != 0) && + (pointKey == nil || d.cmp(pointKey.UserKey, m.SmallestPointKey.UserKey) != 0) { + panic(errors.Newf("pebble: virtual sstable %s lower point key bound is not tight", m.FileNum)) + } + + pointKey, _ = pointIter.Last() + rangeDel = nil + if rangeDelIter != nil { + rangeDel = rangeDelIter.Last() + } + + // Check that the upper bound is tight. + if (rangeDel == nil || d.cmp(rangeDel.LargestKey().UserKey, m.LargestPointKey.UserKey) != 0) && + (pointKey == nil || d.cmp(pointKey.UserKey, m.LargestPointKey.UserKey) != 0) { + panic(errors.Newf("pebble: virtual sstable %s upper point key bound is not tight", m.FileNum)) + } + + // Check that iterator keys are within bounds. 
+ for key, _ := pointIter.First(); key != nil; key, _ = pointIter.Next() { + if d.cmp(key.UserKey, m.SmallestPointKey.UserKey) < 0 || d.cmp(key.UserKey, m.LargestPointKey.UserKey) > 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.UserKey)) + } + } + + if rangeDelIter != nil { + for key := rangeDelIter.First(); key != nil; key = rangeDelIter.Next() { + if d.cmp(key.SmallestKey().UserKey, m.SmallestPointKey.UserKey) < 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.SmallestKey().UserKey)) + } + + if d.cmp(key.LargestKey().UserKey, m.LargestPointKey.UserKey) > 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.LargestKey().UserKey)) + } + } + } + } + + if !m.HasRangeKeys { + return + } + + rangeKeyIter, err := d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) + defer rangeKeyIter.Close() + + if err != nil { + panic(errors.Wrap(err, "pebble: error creating range key iterator")) + } + + // Check that the lower bound is tight. + if d.cmp(rangeKeyIter.First().SmallestKey().UserKey, m.SmallestRangeKey.UserKey) != 0 { + panic(errors.Newf("pebble: virtual sstable %s lower range key bound is not tight", m.FileNum)) + } + + // Check that upper bound is tight. 
+ if d.cmp(rangeKeyIter.Last().LargestKey().UserKey, m.LargestRangeKey.UserKey) != 0 { + panic(errors.Newf("pebble: virtual sstable %s upper range key bound is not tight", m.FileNum)) + } + + for key := rangeKeyIter.First(); key != nil; key = rangeKeyIter.Next() { + if d.cmp(key.SmallestKey().UserKey, m.SmallestRangeKey.UserKey) < 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.SmallestKey().UserKey)) + } + if d.cmp(key.LargestKey().UserKey, m.LargestRangeKey.UserKey) > 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.LargestKey().UserKey)) + } + } +} diff --git a/pebble/db_test.go b/pebble/db_test.go new file mode 100644 index 0000000..631753d --- /dev/null +++ b/pebble/db_test.go @@ -0,0 +1,1969 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "context" + "fmt" + "io" + "path/filepath" + "slices" + "strconv" + "strings" + "sync" + "testing" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/cache" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" + "golang.org/x/exp/rand" +) + +// try repeatedly calls f, sleeping between calls with exponential back-off, +// until f returns a nil error or the total sleep time is greater than or equal +// to maxTotalSleep. It always calls f at least once. 
+func try(initialSleep, maxTotalSleep time.Duration, f func() error) error { + totalSleep := time.Duration(0) + for d := initialSleep; ; d *= 2 { + time.Sleep(d) + totalSleep += d + if err := f(); err == nil || totalSleep >= maxTotalSleep { + return err + } + } +} + +func TestTry(t *testing.T) { + c := make(chan struct{}) + go func() { + time.Sleep(1 * time.Millisecond) + close(c) + }() + + attemptsMu := sync.Mutex{} + attempts := 0 + + err := try(100*time.Microsecond, 20*time.Second, func() error { + attemptsMu.Lock() + attempts++ + attemptsMu.Unlock() + + select { + default: + return errors.New("timed out") + case <-c: + return nil + } + }) + require.NoError(t, err) + + attemptsMu.Lock() + a := attempts + attemptsMu.Unlock() + + if a == 0 { + t.Fatalf("attempts: got 0, want > 0") + } +} + +func TestBasicReads(t *testing.T) { + testCases := []struct { + dirname string + wantMap map[string]string + }{ + { + "db-stage-1", + map[string]string{ + "aaa": "", + "bar": "", + "baz": "", + "foo": "", + "quux": "", + "zzz": "", + }, + }, + { + "db-stage-2", + map[string]string{ + "aaa": "", + "bar": "", + "baz": "three", + "foo": "four", + "quux": "", + "zzz": "", + }, + }, + { + "db-stage-3", + map[string]string{ + "aaa": "", + "bar": "", + "baz": "three", + "foo": "four", + "quux": "", + "zzz": "", + }, + }, + { + "db-stage-4", + map[string]string{ + "aaa": "", + "bar": "", + "baz": "", + "foo": "five", + "quux": "six", + "zzz": "", + }, + }, + } + for _, tc := range testCases { + t.Run(tc.dirname, func(t *testing.T) { + fs := vfs.NewMem() + _, err := vfs.Clone(vfs.Default, fs, filepath.Join("testdata", tc.dirname), tc.dirname) + if err != nil { + t.Fatalf("%s: cloneFileSystem failed: %v", tc.dirname, err) + } + d, err := Open(tc.dirname, testingRandomized(t, &Options{ + FS: fs, + })) + if err != nil { + t.Fatalf("%s: Open failed: %v", tc.dirname, err) + } + for key, want := range tc.wantMap { + got, closer, err := d.Get([]byte(key)) + if err != nil && err != ErrNotFound 
{ + t.Fatalf("%s: Get(%q) failed: %v", tc.dirname, key, err) + } + if string(got) != string(want) { + t.Fatalf("%s: Get(%q): got %q, want %q", tc.dirname, key, got, want) + } + if closer != nil { + closer.Close() + } + } + err = d.Close() + if err != nil { + t.Fatalf("%s: Close failed: %v", tc.dirname, err) + } + }) + } +} + +func TestBasicWrites(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + + names := []string{ + "Alatar", + "Gandalf", + "Pallando", + "Radagast", + "Saruman", + "Joe", + } + wantMap := map[string]string{} + + inBatch, batch, pending := false, &Batch{}, [][]string(nil) + set0 := func(k, v string) error { + return d.Set([]byte(k), []byte(v), nil) + } + del0 := func(k string) error { + return d.Delete([]byte(k), nil) + } + set1 := func(k, v string) error { + batch.Set([]byte(k), []byte(v), nil) + return nil + } + del1 := func(k string) error { + batch.Delete([]byte(k), nil) + return nil + } + set, del := set0, del0 + + testCases := []string{ + "set Gandalf Grey", + "set Saruman White", + "set Radagast Brown", + "delete Saruman", + "set Gandalf White", + "batch", + " set Alatar AliceBlue", + "apply", + "delete Pallando", + "set Alatar AntiqueWhite", + "set Pallando PapayaWhip", + "batch", + "apply", + "set Pallando PaleVioletRed", + "batch", + " delete Alatar", + " set Gandalf GhostWhite", + " set Saruman Seashell", + " delete Saruman", + " set Saruman SeaGreen", + " set Radagast RosyBrown", + " delete Pallando", + "apply", + "delete Radagast", + "delete Radagast", + "delete Radagast", + "set Gandalf Goldenrod", + "set Pallando PeachPuff", + "batch", + " delete Joe", + " delete Saruman", + " delete Radagast", + " delete Pallando", + " delete Gandalf", + " delete Alatar", + "apply", + "set Joe Plumber", + } + for i, tc := range testCases { + s := strings.Split(strings.TrimSpace(tc), " ") + switch s[0] { + case "set": + if err := set(s[1], s[2]); err != nil { + t.Fatalf("#%d %s: 
%v", i, tc, err) + } + if inBatch { + pending = append(pending, s) + } else { + wantMap[s[1]] = s[2] + } + case "delete": + if err := del(s[1]); err != nil { + t.Fatalf("#%d %s: %v", i, tc, err) + } + if inBatch { + pending = append(pending, s) + } else { + delete(wantMap, s[1]) + } + case "batch": + inBatch, batch, set, del = true, &Batch{}, set1, del1 + case "apply": + if err := d.Apply(batch, nil); err != nil { + t.Fatalf("#%d %s: %v", i, tc, err) + } + for _, p := range pending { + switch p[0] { + case "set": + wantMap[p[1]] = p[2] + case "delete": + delete(wantMap, p[1]) + } + } + inBatch, pending, set, del = false, nil, set0, del0 + default: + t.Fatalf("#%d %s: bad test case: %q", i, tc, s) + } + + fail := false + for _, name := range names { + g, closer, err := d.Get([]byte(name)) + if err != nil && err != ErrNotFound { + t.Errorf("#%d %s: Get(%q): %v", i, tc, name, err) + fail = true + } + got, gOK := string(g), err == nil + want, wOK := wantMap[name] + if got != want || gOK != wOK { + t.Errorf("#%d %s: Get(%q): got %q, %t, want %q, %t", + i, tc, name, got, gOK, want, wOK) + fail = true + } + if closer != nil { + closer.Close() + } + } + if fail { + return + } + } + + require.NoError(t, d.Close()) +} + +func TestRandomWrites(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + MemTableSize: 8 * 1024, + })) + require.NoError(t, err) + + keys := [64][]byte{} + wants := [64]int{} + for k := range keys { + keys[k] = []byte(strconv.Itoa(k)) + wants[k] = -1 + } + xxx := bytes.Repeat([]byte("x"), 512) + + rng := rand.New(rand.NewSource(123)) + const N = 1000 + for i := 0; i < N; i++ { + k := rng.Intn(len(keys)) + if rng.Intn(20) != 0 { + wants[k] = rng.Intn(len(xxx) + 1) + if err := d.Set(keys[k], xxx[:wants[k]], nil); err != nil { + t.Fatalf("i=%d: Set: %v", i, err) + } + } else { + wants[k] = -1 + if err := d.Delete(keys[k], nil); err != nil { + t.Fatalf("i=%d: Delete: %v", i, err) + } + } + + if i != N-1 || rng.Intn(50) != 0 
{ + continue + } + for k := range keys { + got := -1 + if v, closer, err := d.Get(keys[k]); err != nil { + if err != ErrNotFound { + t.Fatalf("Get: %v", err) + } + } else { + got = len(v) + closer.Close() + } + if got != wants[k] { + t.Errorf("i=%d, k=%d: got %d, want %d", i, k, got, wants[k]) + } + } + } + + require.NoError(t, d.Close()) +} + +func TestLargeBatch(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + MemTableSize: 1400, + MemTableStopWritesThreshold: 100, + })) + require.NoError(t, err) + + verifyLSM := func(expected string) func() error { + return func() error { + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + if expected != s { + if testing.Verbose() { + fmt.Println(strings.TrimSpace(s)) + } + return errors.Errorf("expected %s, but found %s", expected, s) + } + return nil + } + } + + logNum := func() base.DiskFileNum { + d.mu.Lock() + defer d.mu.Unlock() + return d.mu.log.queue[len(d.mu.log.queue)-1].fileNum + } + fileSize := func(fileNum base.DiskFileNum) int64 { + info, err := d.opts.FS.Stat(base.MakeFilepath(d.opts.FS, "", fileTypeLog, fileNum)) + require.NoError(t, err) + return info.Size() + } + memTableCreationSeqNum := func() uint64 { + d.mu.Lock() + defer d.mu.Unlock() + return d.mu.mem.mutable.logSeqNum + } + + startLogNum := logNum() + startLogStartSize := fileSize(startLogNum) + startSeqNum := d.mu.versions.logSeqNum.Load() + + // Write a key with a value larger than the memtable size. + require.NoError(t, d.Set([]byte("a"), bytes.Repeat([]byte("a"), 512), nil)) + + // Verify that the large batch was written to the WAL that existed before it + // was committed. We verify that WAL rotation occurred, where the large batch + // was written to, and that the new WAL is empty. 
+ endLogNum := logNum() + if startLogNum == endLogNum { + t.Fatal("expected WAL rotation") + } + startLogEndSize := fileSize(startLogNum) + if startLogEndSize == startLogStartSize { + t.Fatalf("expected large batch to be written to %s.log, but file size unchanged at %d", + startLogNum, startLogEndSize) + } + endLogSize := fileSize(endLogNum) + if endLogSize != 0 { + t.Fatalf("expected %s.log to be empty, but found %d", endLogNum, endLogSize) + } + if creationSeqNum := memTableCreationSeqNum(); creationSeqNum <= startSeqNum { + t.Fatalf("expected memTable.logSeqNum=%d > largeBatch.seqNum=%d", creationSeqNum, startSeqNum) + } + + // Verify this results in one L0 table being created. + require.NoError(t, try(100*time.Microsecond, 20*time.Second, + verifyLSM("0.0:\n 000005:[a#10,SET-a#10,SET]\n"))) + + require.NoError(t, d.Set([]byte("b"), bytes.Repeat([]byte("b"), 512), nil)) + + // Verify this results in a second L0 table being created. + require.NoError(t, try(100*time.Microsecond, 20*time.Second, + verifyLSM("0.0:\n 000005:[a#10,SET-a#10,SET]\n 000007:[b#11,SET-b#11,SET]\n"))) + + // Allocate a bunch of batches to exhaust the batchPool. None of these + // batches should have a non-zero count. 
+ for i := 0; i < 10; i++ { + b := d.NewBatch() + require.EqualValues(t, 0, b.Count()) + } + + require.NoError(t, d.Close()) +} + +func TestGetNoCache(t *testing.T) { + cache := NewCache(0) + defer cache.Unref() + + d, err := Open("", testingRandomized(t, &Options{ + Cache: cache, + FS: vfs.NewMem(), + })) + require.NoError(t, err) + + require.NoError(t, d.Set([]byte("a"), []byte("aa"), nil)) + require.NoError(t, d.Flush()) + verifyGet(t, d, []byte("a"), []byte("aa")) + + require.NoError(t, d.Close()) +} + +func TestGetMerge(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + + key := []byte("a") + verify := func(expected string) { + val, closer, err := d.Get(key) + require.NoError(t, err) + + if expected != string(val) { + t.Fatalf("expected %s, but got %s", expected, val) + } + closer.Close() + } + + const val = "1" + for i := 1; i <= 3; i++ { + require.NoError(t, d.Merge(key, []byte(val), nil)) + + expected := strings.Repeat(val, i) + verify(expected) + + require.NoError(t, d.Flush()) + verify(expected) + } + + require.NoError(t, d.Close()) +} + +func TestMergeOrderSameAfterFlush(t *testing.T) { + // Ensure compaction iterator (used by flush) and user iterator process merge + // operands in the same order + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + + key := []byte("a") + verify := func(expected string) { + iter, _ := d.NewIter(nil) + if !iter.SeekGE([]byte("a")) { + t.Fatal("expected one value, but got empty iterator") + } + if expected != string(iter.Value()) { + t.Fatalf("expected %s, but got %s", expected, string(iter.Value())) + } + if !iter.SeekLT([]byte("b")) { + t.Fatal("expected one value, but got empty iterator") + } + if expected != string(iter.Value()) { + t.Fatalf("expected %s, but got %s", expected, string(iter.Value())) + } + require.NoError(t, iter.Close()) + } + + require.NoError(t, d.Merge(key, []byte("0"), nil)) + 
require.NoError(t, d.Merge(key, []byte("1"), nil)) + + verify("01") + require.NoError(t, d.Flush()) + verify("01") + + require.NoError(t, d.Close()) +} + +type closableMerger struct { + lastBuf []byte + closed bool +} + +func (m *closableMerger) MergeNewer(value []byte) error { + m.lastBuf = append(m.lastBuf[:0], value...) + return nil +} + +func (m *closableMerger) MergeOlder(value []byte) error { + m.lastBuf = append(m.lastBuf[:0], value...) + return nil +} + +func (m *closableMerger) Finish(includesBase bool) ([]byte, io.Closer, error) { + return m.lastBuf, m, nil +} + +func (m *closableMerger) Close() error { + m.closed = true + return nil +} + +func TestMergerClosing(t *testing.T) { + m := &closableMerger{} + + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + Merger: &Merger{ + Merge: func(key, value []byte) (base.ValueMerger, error) { + return m, m.MergeNewer(value) + }, + }, + })) + require.NoError(t, err) + + defer func() { + require.NoError(t, d.Close()) + }() + + err = d.Merge([]byte("a"), []byte("b"), nil) + require.NoError(t, err) + require.False(t, m.closed) + + val, closer, err := d.Get([]byte("a")) + require.NoError(t, err) + require.Equal(t, []byte("b"), val) + require.NotNil(t, closer) + require.False(t, m.closed) + _ = closer.Close() + require.True(t, m.closed) +} + +func TestLogData(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + + defer func() { + require.NoError(t, d.Close()) + }() + + require.NoError(t, d.LogData([]byte("foo"), Sync)) + require.NoError(t, d.LogData([]byte("bar"), Sync)) + // TODO(itsbilal): Confirm that we wrote some bytes to the WAL. + // For now, LogData proceeding ahead without a panic is good enough. 
+} + +func TestSingleDeleteGet(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + defer func() { + require.NoError(t, d.Close()) + }() + + key := []byte("key") + val := []byte("val") + + require.NoError(t, d.Set(key, val, nil)) + verifyGet(t, d, key, val) + + key2 := []byte("key2") + val2 := []byte("val2") + + require.NoError(t, d.Set(key2, val2, nil)) + verifyGet(t, d, key2, val2) + + require.NoError(t, d.SingleDelete(key2, nil)) + verifyGetNotFound(t, d, key2) +} + +func TestSingleDeleteFlush(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + defer func() { + require.NoError(t, d.Close()) + }() + + key := []byte("key") + valFirst := []byte("first") + valSecond := []byte("second") + key2 := []byte("key2") + val2 := []byte("val2") + + require.NoError(t, d.Set(key, valFirst, nil)) + require.NoError(t, d.Set(key2, val2, nil)) + require.NoError(t, d.Flush()) + + require.NoError(t, d.SingleDelete(key, nil)) + require.NoError(t, d.Set(key, valSecond, nil)) + require.NoError(t, d.Delete(key2, nil)) + require.NoError(t, d.Set(key2, val2, nil)) + require.NoError(t, d.Flush()) + + require.NoError(t, d.SingleDelete(key, nil)) + require.NoError(t, d.Delete(key2, nil)) + require.NoError(t, d.Flush()) + + verifyGetNotFound(t, d, key) + verifyGetNotFound(t, d, key2) +} + +func TestUnremovableSingleDelete(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + L0CompactionThreshold: 8, + })) + require.NoError(t, err) + defer func() { + require.NoError(t, d.Close()) + }() + + key := []byte("key") + valFirst := []byte("valFirst") + valSecond := []byte("valSecond") + + require.NoError(t, d.Set(key, valFirst, nil)) + ss := d.NewSnapshot() + defer ss.Close() + require.NoError(t, d.SingleDelete(key, nil)) + require.NoError(t, d.Set(key, valSecond, nil)) + require.NoError(t, d.Flush()) + + verifyGet(t, ss, key, 
valFirst) + verifyGet(t, d, key, valSecond) + + require.NoError(t, d.SingleDelete(key, nil)) + + verifyGet(t, ss, key, valFirst) + verifyGetNotFound(t, d, key) + + require.NoError(t, d.Flush()) + + verifyGet(t, ss, key, valFirst) + verifyGetNotFound(t, d, key) +} + +func TestIterLeak(t *testing.T) { + for _, leak := range []bool{true, false} { + t.Run(fmt.Sprintf("leak=%t", leak), func(t *testing.T) { + for _, flush := range []bool{true, false} { + t.Run(fmt.Sprintf("flush=%t", flush), func(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + + require.NoError(t, d.Set([]byte("a"), []byte("a"), nil)) + if flush { + require.NoError(t, d.Flush()) + } + iter, _ := d.NewIter(nil) + iter.First() + if !leak { + require.NoError(t, iter.Close()) + require.NoError(t, d.Close()) + } else { + defer iter.Close() + if err := d.Close(); err == nil { + t.Fatalf("expected failure, but found success") + } else if !strings.HasPrefix(err.Error(), "leaked iterators:") { + t.Fatalf("expected leaked iterators, but found %+v", err) + } else { + t.Log(err.Error()) + } + } + }) + } + }) + } +} + +// Make sure that we detect an iter leak when only one DB closes +// while the second db still holds a reference to the TableCache. 
+func TestIterLeakSharedCache(t *testing.T) { + for _, leak := range []bool{true, false} { + t.Run(fmt.Sprintf("leak=%t", leak), func(t *testing.T) { + for _, flush := range []bool{true, false} { + t.Run(fmt.Sprintf("flush=%t", flush), func(t *testing.T) { + d1, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + + d2, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + + require.NoError(t, d1.Set([]byte("a"), []byte("a"), nil)) + if flush { + require.NoError(t, d1.Flush()) + } + + require.NoError(t, d2.Set([]byte("a"), []byte("a"), nil)) + if flush { + require.NoError(t, d2.Flush()) + } + + // Check if leak detection works with only one db closing. + { + iter1, _ := d1.NewIter(nil) + iter1.First() + if !leak { + require.NoError(t, iter1.Close()) + require.NoError(t, d1.Close()) + } else { + defer iter1.Close() + if err := d1.Close(); err == nil { + t.Fatalf("expected failure, but found success") + } else if !strings.HasPrefix(err.Error(), "leaked iterators:") { + t.Fatalf("expected leaked iterators, but found %+v", err) + } else { + t.Log(err.Error()) + } + } + } + + { + iter2, _ := d2.NewIter(nil) + iter2.First() + if !leak { + require.NoError(t, iter2.Close()) + require.NoError(t, d2.Close()) + } else { + defer iter2.Close() + if err := d2.Close(); err == nil { + t.Fatalf("expected failure, but found success") + } else if !strings.HasPrefix(err.Error(), "leaked iterators:") { + t.Fatalf("expected leaked iterators, but found %+v", err) + } else { + t.Log(err.Error()) + } + } + } + + }) + } + }) + } +} + +func TestMemTableReservation(t *testing.T) { + opts := &Options{ + Cache: NewCache(128 << 10 /* 128 KB */), + MemTableSize: initialMemTableSize, + FS: vfs.NewMem(), + } + defer opts.Cache.Unref() + opts.testingRandomized(t) + opts.EnsureDefaults() + // We're going to be looking at and asserting the global memtable reservation + // amount below so we don't want to race with any triggered stats collections. 
+ opts.private.disableTableStats = true + + // Add a block to the cache. Note that the memtable size is larger than the + // cache size, so opening the DB should cause this block to be evicted. + tmpID := opts.Cache.NewID() + helloWorld := []byte("hello world") + value := cache.Alloc(len(helloWorld)) + copy(value.Buf(), helloWorld) + opts.Cache.Set(tmpID, base.FileNum(0).DiskFileNum(), 0, value).Release() + + d, err := Open("", opts) + require.NoError(t, err) + + checkReserved := func(expected int64) { + t.Helper() + if reserved := d.memTableReserved.Load(); expected != reserved { + t.Fatalf("expected %d reserved, but found %d", expected, reserved) + } + } + + checkReserved(int64(opts.MemTableSize)) + if refs := d.mu.mem.queue[len(d.mu.mem.queue)-1].readerRefs.Load(); refs != 2 { + t.Fatalf("expected 2 refs, but found %d", refs) + } + // Verify the memtable reservation has caused our test block to be evicted. + if h := opts.Cache.Get(tmpID, base.FileNum(0).DiskFileNum(), 0); h.Get() != nil { + t.Fatalf("expected failure, but found success: %s", h.Get()) + } + + // Flush the memtable. The memtable reservation should double because old + // memtable will be recycled, saved for the next memtable allocation. + require.NoError(t, d.Flush()) + checkReserved(int64(2 * opts.MemTableSize)) + // Flush again. The memtable reservation should be unchanged because at most + // 1 memtable may be preserved for recycling. + + // Flush in the presence of an active iterator. The iterator will hold a + // reference to a readState which will in turn hold a reader reference to the + // memtable. + iter, _ := d.NewIter(nil) + require.NoError(t, d.Flush()) + // The flush moved the recycled memtable into position as an active mutable + // memtable. There are now two allocated memtables: 1 mutable and 1 pinned + // by the iterator's read state. 
+ checkReserved(2 * int64(opts.MemTableSize)) + + // Flushing again should increase the reservation total to 3x: 1 active + // mutable, 1 for recycling, 1 pinned by iterator's read state. + require.NoError(t, d.Flush()) + checkReserved(3 * int64(opts.MemTableSize)) + + // Closing the iterator will release the iterator's read state, and the old + // memtable will be moved into position as the next memtable to recycle. + // There was already a memtable ready to be recycled, so that memtable will + // be freed and the overall reservation total is reduced to 2x. + require.NoError(t, iter.Close()) + checkReserved(2 * int64(opts.MemTableSize)) + + require.NoError(t, d.Close()) +} + +func TestMemTableReservationLeak(t *testing.T) { + d, err := Open("", &Options{FS: vfs.NewMem()}) + require.NoError(t, err) + + d.mu.Lock() + last := d.mu.mem.queue[len(d.mu.mem.queue)-1] + last.readerRef() + defer func() { + last.readerUnref(true) + }() + d.mu.Unlock() + if err := d.Close(); err == nil { + t.Fatalf("expected failure, but found success") + } else if !strings.HasPrefix(err.Error(), "leaked memtable reservation:") { + t.Fatalf("expected leaked memtable reservation, but found %+v", err) + } else { + t.Log(err.Error()) + } +} + +func TestCacheEvict(t *testing.T) { + cache := NewCache(10 << 20) + defer cache.Unref() + + d, err := Open("", &Options{ + Cache: cache, + FS: vfs.NewMem(), + }) + require.NoError(t, err) + + for i := 0; i < 1000; i++ { + key := []byte(fmt.Sprintf("%04d", i)) + require.NoError(t, d.Set(key, key, nil)) + } + + require.NoError(t, d.Flush()) + iter, _ := d.NewIter(nil) + for iter.First(); iter.Valid(); iter.Next() { + } + require.NoError(t, iter.Close()) + + if size := cache.Size(); size == 0 { + t.Fatalf("expected non-zero cache size") + } + + for i := 0; i < 1000; i++ { + key := []byte(fmt.Sprintf("%04d", i)) + require.NoError(t, d.Delete(key, nil)) + } + + require.NoError(t, d.Compact([]byte("0"), []byte("1"), false)) + + require.NoError(t, d.Close()) + + 
if size := cache.Size(); size != 0 { + t.Fatalf("expected empty cache, but found %d", size) + } +} + +func TestFlushEmpty(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + + // Flushing an empty memtable should not fail. + require.NoError(t, d.Flush()) + require.NoError(t, d.Close()) +} + +func TestRollManifest(t *testing.T) { + toPreserve := rand.Int31n(5) + 1 + opts := &Options{ + MaxManifestFileSize: 1, + L0CompactionThreshold: 10, + L0StopWritesThreshold: 1000, + FS: vfs.NewMem(), + NumPrevManifest: int(toPreserve), + } + opts.DisableAutomaticCompactions = true + opts.testingRandomized(t) + d, err := Open("", opts) + require.NoError(t, err) + + manifestFileNumber := func() base.DiskFileNum { + d.mu.Lock() + defer d.mu.Unlock() + return d.mu.versions.manifestFileNum + } + sizeRolloverState := func() (int64, int64) { + d.mu.Lock() + defer d.mu.Unlock() + return d.mu.versions.rotationHelper.DebugInfo() + } + + current := func() string { + desc, err := Peek(d.dirname, d.opts.FS) + require.NoError(t, err) + return desc.ManifestFilename + } + + lastManifestNum := manifestFileNumber() + manifestNums := []base.DiskFileNum{lastManifestNum} + for i := 0; i < 5; i++ { + // MaxManifestFileSize is 1, but the rollover logic also counts edits + // since the last snapshot to decide on rollover, so do as many flushes as + // it demands. + lastSnapshotCount, editsSinceSnapshotCount := sizeRolloverState() + var expectedLastSnapshotCount, expectedEditsSinceSnapshotCount int64 + switch i { + case 0: + // DB is empty. + expectedLastSnapshotCount, expectedEditsSinceSnapshotCount = 0, 0 + case 1: + // First edit that caused rollover is not in the snapshot. + expectedLastSnapshotCount, expectedEditsSinceSnapshotCount = 0, 1 + case 2: + // One flush is in the snapshot. One flush in the edit. + expectedLastSnapshotCount, expectedEditsSinceSnapshotCount = 1, 1 + case 3: + // Two flushes in the snapshot. 
One flush in the edit. Will need to do + // two more flushes, the first of which will be in the next snapshot. + expectedLastSnapshotCount, expectedEditsSinceSnapshotCount = 2, 1 + case 4: + // Four flushes in the snapshot. One flush in the edit. Will need to do + // four more flushes, three of which will be in the snapshot. + expectedLastSnapshotCount, expectedEditsSinceSnapshotCount = 4, 1 + } + require.Equal(t, expectedLastSnapshotCount, lastSnapshotCount) + require.Equal(t, expectedEditsSinceSnapshotCount, editsSinceSnapshotCount) + // Number of flushes to do to trigger the rollover. + steps := int(lastSnapshotCount - editsSinceSnapshotCount + 1) + // Steps can be <= 0, but we need to do at least one edit to trigger the + // rollover logic. + if steps <= 0 { + steps = 1 + } + for j := 0; j < steps; j++ { + require.NoError(t, d.Set([]byte("a"), nil, nil)) + require.NoError(t, d.Flush()) + } + d.TestOnlyWaitForCleaning() + num := manifestFileNumber() + if lastManifestNum == num { + t.Fatalf("manifest failed to roll %d: %d == %d", i, lastManifestNum, num) + } + + manifestNums = append(manifestNums, num) + lastManifestNum = num + + expectedCurrent := fmt.Sprintf("MANIFEST-%s", lastManifestNum) + if v := current(); expectedCurrent != v { + t.Fatalf("expected %s, but found %s", expectedCurrent, v) + } + } + lastSnapshotCount, editsSinceSnapshotCount := sizeRolloverState() + require.EqualValues(t, 8, lastSnapshotCount) + require.EqualValues(t, 1, editsSinceSnapshotCount) + + files, err := d.opts.FS.List("") + require.NoError(t, err) + + var manifests []string + for _, filename := range files { + fileType, _, ok := base.ParseFilename(d.opts.FS, filename) + if !ok { + continue + } + if fileType == fileTypeManifest { + manifests = append(manifests, filename) + } + } + slices.Sort(manifests) + + var expected []string + for i := len(manifestNums) - int(toPreserve) - 1; i < len(manifestNums); i++ { + expected = append( + expected, + fmt.Sprintf("MANIFEST-%s", 
manifestNums[i]), + ) + } + require.EqualValues(t, expected, manifests) + + // Test the logic that uses the future snapshot size to rollover. + // Reminder: we have a snapshot with 8 files and the manifest has 1 edit + // (flush) with 1 file. + // Add 8 more files with a different key. + lastManifestNum = manifestFileNumber() + for j := 0; j < 8; j++ { + require.NoError(t, d.Set([]byte("c"), nil, nil)) + require.NoError(t, d.Flush()) + } + lastSnapshotCount, editsSinceSnapshotCount = sizeRolloverState() + // Need 16 more files in edits to trigger a rollover. + require.EqualValues(t, 16, lastSnapshotCount) + require.EqualValues(t, 1, editsSinceSnapshotCount) + require.NotEqual(t, manifestFileNumber(), lastManifestNum) + lastManifestNum = manifestFileNumber() + // Do a compaction that moves 8 of the files from L0 to 1 file in L6. This + // adds 9 files in edits. We still need 6 more files in edits based on the + // last snapshot. But the current version has only 9 L0 files and 1 L6 file, + // for a total of 10 files. So 1 flush should push us over that threshold. 
+ d.Compact([]byte("c"), []byte("d"), false) + lastSnapshotCount, editsSinceSnapshotCount = sizeRolloverState() + require.EqualValues(t, 16, lastSnapshotCount) + require.EqualValues(t, 10, editsSinceSnapshotCount) + require.Equal(t, manifestFileNumber(), lastManifestNum) + require.NoError(t, d.Set([]byte("c"), nil, nil)) + require.NoError(t, d.Flush()) + lastSnapshotCount, editsSinceSnapshotCount = sizeRolloverState() + require.EqualValues(t, 10, lastSnapshotCount) + require.EqualValues(t, 1, editsSinceSnapshotCount) + require.NotEqual(t, manifestFileNumber(), lastManifestNum) + + require.NoError(t, d.Close()) +} + +func TestDBClosed(t *testing.T) { + d, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + require.NoError(t, d.Close()) + + catch := func(f func()) (err error) { + defer func() { + if r := recover(); r != nil { + err = r.(error) + } + }() + f() + return nil + } + + require.True(t, errors.Is(catch(func() { _ = d.Close() }), ErrClosed)) + + require.True(t, errors.Is(catch(func() { _ = d.Compact(nil, nil, false) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.Flush() }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _, _ = d.AsyncFlush() }), ErrClosed)) + + require.True(t, errors.Is(catch(func() { _, _, _ = d.Get(nil) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.Delete(nil, nil) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.DeleteRange(nil, nil, nil) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.Ingest(nil) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.LogData(nil, nil) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.Merge(nil, nil, nil) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.RatchetFormatMajorVersion(internalFormatNewest) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.Set(nil, nil, nil) }), ErrClosed)) + + require.True(t, errors.Is(catch(func() { _ = d.NewSnapshot() 
}), ErrClosed)) + + b := d.NewIndexedBatch() + require.True(t, errors.Is(catch(func() { _ = b.Commit(nil) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _ = d.Apply(b, nil) }), ErrClosed)) + require.True(t, errors.Is(catch(func() { _, _ = b.NewIter(nil) }), ErrClosed)) +} + +func TestDBConcurrentCommitCompactFlush(t *testing.T) { + d, err := Open("", testingRandomized(t, &Options{ + FS: vfs.NewMem(), + })) + require.NoError(t, err) + + // Concurrently commit, compact, and flush in order to stress the locking around + // those operations. + const n = 1000 + var wg sync.WaitGroup + wg.Add(n) + for i := 0; i < n; i++ { + go func(i int) { + defer wg.Done() + _ = d.Set([]byte(fmt.Sprint(i)), nil, nil) + var err error + switch i % 3 { + case 0: + err = d.Compact(nil, []byte("\xff"), false) + case 1: + err = d.Flush() + case 2: + _, err = d.AsyncFlush() + } + require.NoError(t, err) + }(i) + } + wg.Wait() + + require.NoError(t, d.Close()) +} + +func TestDBConcurrentCompactClose(t *testing.T) { + // Test closing while a compaction is ongoing. This ensures compaction code + // detects the close and finishes cleanly. + mem := vfs.NewMem() + for i := 0; i < 100; i++ { + opts := &Options{ + FS: mem, + MaxConcurrentCompactions: func() int { + return 2 + }, + } + d, err := Open("", testingRandomized(t, opts)) + require.NoError(t, err) + + // Ingest a series of files containing a single key each. As the outer + // loop progresses, these ingestions will build up compaction debt + // causing compactions to be running concurrently with the close below. 
+ for j := 0; j < 10; j++ { + path := fmt.Sprintf("ext%d", j) + f, err := mem.Create(path) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + require.NoError(t, w.Set([]byte(fmt.Sprint(j)), nil)) + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{path})) + } + + require.NoError(t, d.Close()) + } +} + +func TestDBApplyBatchNilDB(t *testing.T) { + d, err := Open("", &Options{FS: vfs.NewMem()}) + require.NoError(t, err) + + b1 := &Batch{} + b1.Set([]byte("test"), nil, nil) + + b2 := &Batch{} + b2.Apply(b1, nil) + if b2.memTableSize != 0 { + t.Fatalf("expected memTableSize to not be set") + } + require.NoError(t, d.Apply(b2, nil)) + if b1.memTableSize != b2.memTableSize { + t.Fatalf("expected memTableSize %d, but found %d", b1.memTableSize, b2.memTableSize) + } + + require.NoError(t, d.Close()) +} + +func TestDBApplyBatchMismatch(t *testing.T) { + srcDB, err := Open("", &Options{FS: vfs.NewMem()}) + require.NoError(t, err) + + applyDB, err := Open("", &Options{FS: vfs.NewMem()}) + require.NoError(t, err) + + err = func() (err error) { + defer func() { + if v := recover(); v != nil { + err = errors.Errorf("%v", v) + } + }() + + b := srcDB.NewBatch() + b.Set([]byte("test"), nil, nil) + return applyDB.Apply(b, nil) + }() + if err == nil || !strings.Contains(err.Error(), "pebble: batch db mismatch:") { + t.Fatalf("expected error, but found %v", err) + } + + require.NoError(t, srcDB.Close()) + require.NoError(t, applyDB.Close()) +} + +func TestCloseCleanerRace(t *testing.T) { + mem := vfs.NewMem() + for i := 0; i < 20; i++ { + db, err := Open("", testingRandomized(t, &Options{FS: mem})) + require.NoError(t, err) + require.NoError(t, db.Set([]byte("a"), []byte("something"), Sync)) + require.NoError(t, db.Flush()) + // Ref the sstables so cannot be deleted. 
+ it, _ := db.NewIter(nil) + require.NotNil(t, it) + require.NoError(t, db.DeleteRange([]byte("a"), []byte("b"), Sync)) + require.NoError(t, db.Compact([]byte("a"), []byte("b"), false)) + // Only the iterator is keeping the sstables alive. + files, err := mem.List("/") + require.NoError(t, err) + var found bool + for _, f := range files { + if strings.HasSuffix(f, ".sst") { + found = true + break + } + } + require.True(t, found) + // Close the iterator and the db in succession so file cleaning races with DB.Close() -- + // latter should wait for file cleaning to finish. + require.NoError(t, it.Close()) + require.NoError(t, db.Close()) + files, err = mem.List("/") + require.NoError(t, err) + for _, f := range files { + if strings.HasSuffix(f, ".sst") { + t.Fatalf("found sst: %s", f) + } + } + } +} + +func TestSSTablesWithApproximateSpanBytes(t *testing.T) { + d, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + // Create two sstables. + // sstable is contained within keyspan (fileNum = 5). + require.NoError(t, d.Set([]byte("c"), nil, nil)) + require.NoError(t, d.Set([]byte("d"), nil, nil)) + require.NoError(t, d.Flush()) + + // sstable partially overlaps keyspan (fileNum = 7). + require.NoError(t, d.Set([]byte("d"), nil, nil)) + require.NoError(t, d.Set([]byte("g"), nil, nil)) + require.NoError(t, d.Flush()) + + // cannot use WithApproximateSpanBytes without WithProperties. + _, err = d.SSTables(WithKeyRangeFilter([]byte("a"), []byte("e")), WithApproximateSpanBytes()) + require.Error(t, err) + + // cannot use WithApproximateSpanBytes without WithKeyRangeFilter. 
+ _, err = d.SSTables(WithProperties(), WithApproximateSpanBytes()) + require.Error(t, err) + + tableInfos, err := d.SSTables(WithProperties(), WithKeyRangeFilter([]byte("a"), []byte("e")), WithApproximateSpanBytes()) + require.NoError(t, err) + + for _, levelTables := range tableInfos { + for _, table := range levelTables { + approximateSpanBytes, err := strconv.ParseInt(table.Properties.UserProperties["approximate-span-bytes"], 10, 64) + require.NoError(t, err) + if table.FileNum == 5 { + require.Equal(t, uint64(approximateSpanBytes), table.Size) + } + if table.FileNum == 7 { + require.Less(t, uint64(approximateSpanBytes), table.Size) + } + } + } +} + +func TestFilterSSTablesWithOption(t *testing.T) { + d, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + // Create two sstables. + require.NoError(t, d.Set([]byte("/Table/5"), nil, nil)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Set([]byte("/Table/10"), nil, nil)) + require.NoError(t, d.Flush()) + + tableInfos, err := d.SSTables(WithKeyRangeFilter([]byte("/Table/5"), []byte("/Table/6"))) + require.NoError(t, err) + + totalTables := 0 + for _, levelTables := range tableInfos { + totalTables += len(levelTables) + } + + // with filter second sstable should not be returned + require.EqualValues(t, 1, totalTables) + + tableInfos, err = d.SSTables() + require.NoError(t, err) + + totalTables = 0 + for _, levelTables := range tableInfos { + totalTables += len(levelTables) + } + + // without filter + require.EqualValues(t, 2, totalTables) +} + +func TestSSTables(t *testing.T) { + d, err := Open("", &Options{ + FS: vfs.NewMem(), + }) + require.NoError(t, err) + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + // Create two sstables. 
+ require.NoError(t, d.Set([]byte("hello"), nil, nil)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Set([]byte("world"), nil, nil)) + require.NoError(t, d.Flush()) + + // by default returned table infos should not contain Properties + tableInfos, err := d.SSTables() + require.NoError(t, err) + for _, levelTables := range tableInfos { + for _, info := range levelTables { + require.Nil(t, info.Properties) + } + } + + // with opt `WithProperties()` the `Properties` in table info should not be nil + tableInfos, err = d.SSTables(WithProperties()) + require.NoError(t, err) + for _, levelTables := range tableInfos { + for _, info := range levelTables { + require.NotNil(t, info.Properties) + } + } +} + +type testTracer struct { + enabledOnlyForNonBackgroundContext bool + buf strings.Builder +} + +func (t *testTracer) Infof(format string, args ...interface{}) {} +func (t *testTracer) Errorf(format string, args ...interface{}) {} +func (t *testTracer) Fatalf(format string, args ...interface{}) {} + +func (t *testTracer) Eventf(ctx context.Context, format string, args ...interface{}) { + if t.enabledOnlyForNonBackgroundContext && ctx == context.Background() { + return + } + fmt.Fprintf(&t.buf, format, args...) + fmt.Fprint(&t.buf, "\n") +} + +func (t *testTracer) IsTracingEnabled(ctx context.Context) bool { + if t.enabledOnlyForNonBackgroundContext && ctx == context.Background() { + return false + } + return true +} + +func TestTracing(t *testing.T) { + if !invariants.Enabled { + // The test relies on timing behavior injected when invariants.Enabled. + return + } + var tracer testTracer + c := NewCache(0) + defer c.Unref() + d, err := Open("", &Options{ + FS: vfs.NewMem(), + Cache: c, + LoggerAndTracer: &tracer, + }) + require.NoError(t, err) + defer func() { + require.NoError(t, d.Close()) + }() + + // Create a sstable. 
+ require.NoError(t, d.Set([]byte("hello"), nil, nil)) + require.NoError(t, d.Flush()) + _, closer, err := d.Get([]byte("hello")) + require.NoError(t, err) + closer.Close() + readerInitTraceString := "reading 37 bytes took 5ms\nreading 628 bytes took 5ms\n" + iterTraceString := "reading 27 bytes took 5ms\nreading 29 bytes took 5ms\n" + require.Equal(t, readerInitTraceString+iterTraceString, tracer.buf.String()) + + // Get again, but since it currently uses context.Background(), no trace + // output is produced. + tracer.buf.Reset() + tracer.enabledOnlyForNonBackgroundContext = true + _, closer, err = d.Get([]byte("hello")) + require.NoError(t, err) + closer.Close() + require.Equal(t, "", tracer.buf.String()) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + iter, _ := d.NewIterWithContext(ctx, nil) + iter.SeekGE([]byte("hello")) + iter.Close() + require.Equal(t, iterTraceString, tracer.buf.String()) + + tracer.buf.Reset() + snap := d.NewSnapshot() + iter, _ = snap.NewIterWithContext(ctx, nil) + iter.SeekGE([]byte("hello")) + iter.Close() + require.Equal(t, iterTraceString, tracer.buf.String()) + snap.Close() + + tracer.buf.Reset() + b := d.NewIndexedBatch() + iter, err = b.NewIterWithContext(ctx, nil) + require.NoError(t, err) + iter.SeekGE([]byte("hello")) + iter.Close() + require.Equal(t, iterTraceString, tracer.buf.String()) + b.Close() +} + +func TestMemtableIngestInversion(t *testing.T) { + memFS := vfs.NewMem() + opts := &Options{ + FS: memFS, + MemTableSize: 256 << 10, // 4KB + MemTableStopWritesThreshold: 1000, + L0StopWritesThreshold: 1000, + L0CompactionThreshold: 2, + MaxConcurrentCompactions: func() int { + return 1000 + }, + } + + const channelTimeout = 5 * time.Second + + // We induce delay in compactions by passing in an EventListener that stalls on + // the first TableCreated event for a compaction job we want to block. 
+ // FlushBegin and CompactionBegin has info on compaction start/output levels + // which is what we need to identify what compactions to block. However + // FlushBegin and CompactionBegin are called while holding db.mu, so we cannot + // block those events forever. Instead, we grab the job ID from those events + // and store it. Then during TableCreated, we check if we're creating an output + // for a job we have identified earlier as one to block, and then hold on a + // semaphore there until there's a signal from the test code to resume with the + // compaction. + // + // If nextBlockedCompaction is non-zero, we must block the next compaction + // out of the nextBlockedCompaction - 3 start level. 1 means block the next + // intra-L0 compaction and 2 means block the next flush (as flushes have + // a -1 start level). + var nextBlockedCompaction, blockedJobID int + var blockedCompactionsMu sync.Mutex // protects the above two variables. + nextSem := make(chan chan struct{}, 1) + var el EventListener + el.EnsureDefaults(testLogger{t: t}) + el.FlushBegin = func(info FlushInfo) { + blockedCompactionsMu.Lock() + defer blockedCompactionsMu.Unlock() + if nextBlockedCompaction == 2 { + nextBlockedCompaction = 0 + blockedJobID = info.JobID + } + } + el.CompactionBegin = func(info CompactionInfo) { + // 0 = block nothing, 1 = block intra-L0 compaction, 2 = block flush, + // 3 = block L0 -> LBase compaction, 4 = block compaction out of L1, and so on. + blockedCompactionsMu.Lock() + defer blockedCompactionsMu.Unlock() + blockValue := info.Input[0].Level + 3 + if info.Input[0].Level == 0 && info.Output.Level == 0 { + // Intra L0 compaction, denoted by casValue of 1. 
+ blockValue = 1 + } + if nextBlockedCompaction == blockValue { + nextBlockedCompaction = 0 + blockedJobID = info.JobID + } + } + el.TableCreated = func(info TableCreateInfo) { + blockedCompactionsMu.Lock() + if info.JobID != blockedJobID { + blockedCompactionsMu.Unlock() + return + } + blockedJobID = 0 + blockedCompactionsMu.Unlock() + sem := make(chan struct{}) + nextSem <- sem + <-sem + } + tel := TeeEventListener(MakeLoggingEventListener(testLogger{t: t}), el) + opts.EventListener = &tel + opts.Experimental.L0CompactionConcurrency = 1 + d, err := Open("", opts) + require.NoError(t, err) + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + printLSM := func() { + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + t.Logf("%s", s) + } + + // Create some sstables. These should go into L6. These are irrelevant for + // the rest of the test. + require.NoError(t, d.Set([]byte("b"), []byte("foo"), nil)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Set([]byte("d"), []byte("bar"), nil)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Compact([]byte("a"), []byte("z"), true)) + + var baseCompactionSem, flushSem, intraL0Sem chan struct{} + // Block an L0 -> LBase compaction. This is necessary to induce intra-L0 + // compactions later on. + blockedCompactionsMu.Lock() + nextBlockedCompaction = 3 + blockedCompactionsMu.Unlock() + timeoutSem := time.After(channelTimeout) + t.Log("blocking an L0 -> LBase compaction") + // Write an sstable to L0 until we're blocked on an L0 -> LBase compaction. 
+ breakLoop := false + for !breakLoop { + select { + case sem := <-nextSem: + baseCompactionSem = sem + breakLoop = true + case <-timeoutSem: + t.Fatal("did not get blocked on an LBase compaction") + default: + require.NoError(t, d.Set([]byte("b"), []byte("foo"), nil)) + require.NoError(t, d.Set([]byte("g"), []byte("bar"), nil)) + require.NoError(t, d.Flush()) + time.Sleep(100 * time.Millisecond) + } + } + printLSM() + + // Do 4 ingests, one with the key cc, one with bb and cc, and two with just bb. + // The purpose of the sstable containing cc is to inflate the L0 sublevel + // count of the interval at cc, as that's where we want the intra-L0 compaction + // to be seeded. However we also need a file left of that interval to have + // the same (or higher) sublevel to trigger the bug in + // cockroachdb/cockroach#101896. That's why we ingest a file after it to + // "bridge" the bb/cc intervals, and then ingest a file at bb. These go + // into sublevels like this: + // + // bb + // bb + // bb-----cc + // cc + // + // Eventually, we'll drop an ingested file containing a range del starting at + // cc around here: + // + // bb + // bb cc---... 
+ // bb-----cc + // cc + { + path := "ingest1.sst" + f, err := memFS.Create(path) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + require.NoError(t, w.Set([]byte("cc"), []byte("foo"))) + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{path})) + } + { + path := "ingest2.sst" + f, err := memFS.Create(path) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + require.NoError(t, w.Set([]byte("bb"), []byte("foo2"))) + require.NoError(t, w.Set([]byte("cc"), []byte("foo2"))) + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{path})) + } + { + path := "ingest3.sst" + f, err := memFS.Create(path) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + require.NoError(t, w.Set([]byte("bb"), []byte("foo3"))) + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{path})) + } + { + path := "ingest4.sst" + f, err := memFS.Create(path) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + require.NoError(t, w.Set([]byte("bb"), []byte("foo4"))) + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{path})) + } + + // We now have a base compaction blocked. Block a memtable flush to cause + // memtables to queue up. 
+ // + // Memtable (stuck): + // + // b-----------------g + // + // Relevant L0 ssstables + // + // bb + // bb + // bb-----cc + // cc + blockedCompactionsMu.Lock() + nextBlockedCompaction = 2 + blockedCompactionsMu.Unlock() + t.Log("blocking a flush") + require.NoError(t, d.Set([]byte("b"), []byte("foo2"), nil)) + require.NoError(t, d.Set([]byte("g"), []byte("bar2"), nil)) + _, _ = d.AsyncFlush() + select { + case sem := <-nextSem: + flushSem = sem + case <-time.After(channelTimeout): + t.Fatal("did not get blocked on a flush") + } + // Add one memtable to flush queue, and finish it off. + // + // Memtables (stuck): + // + // b-----------------g (waiting to flush) + // b-----------------g (flushing, blocked) + // + // Relevant L0 ssstables + // + // bb + // bb + // bb-----cc + // cc + require.NoError(t, d.Set([]byte("b"), []byte("foo3"), nil)) + require.NoError(t, d.Set([]byte("g"), []byte("bar3"), nil)) + // note: this flush will wait for the earlier, blocked flush, but it closes + // off the memtable which is what we want. + _, _ = d.AsyncFlush() + + // Open a new mutable memtable. This gets us an earlier earlierUnflushedSeqNum + // than the ingest below it. + require.NoError(t, d.Set([]byte("c"), []byte("somethingbigishappening"), nil)) + // Block an intra-L0 compaction, as one might happen around this time. + blockedCompactionsMu.Lock() + nextBlockedCompaction = 1 + blockedCompactionsMu.Unlock() + t.Log("blocking an intra-L0 compaction") + // Ingest a file containing a cc-e rangedel. 
+ // + // Memtables: + // + // c (mutable) + // b-----------------g (waiting to flush) + // b-----------------g (flushing, blocked) + // + // Relevant L0 ssstables + // + // bb + // bb cc-----e (just ingested) + // bb-----cc + // cc + { + path := "ingest5.sst" + f, err := memFS.Create(path) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + require.NoError(t, w.DeleteRange([]byte("cc"), []byte("e"))) + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{path})) + } + t.Log("main ingest complete") + printLSM() + t.Logf("%s", d.Metrics().String()) + + require.NoError(t, d.Set([]byte("d"), []byte("ThisShouldNotBeDeleted"), nil)) + + // Do another ingest with a seqnum newer than d. The purpose of this is to + // increase the LargestSeqNum of the intra-L0 compaction output *beyond* + // the flush that contains d=ThisShouldNotBeDeleted, therefore causing + // that point key to be deleted (in the buggy code). + // + // Memtables: + // + // c-----d (mutable) + // b-----------------g (waiting to flush) + // b-----------------g (flushing, blocked) + // + // Relevant L0 ssstables + // + // bb cc + // bb cc-----e (just ingested) + // bb-----cc + // cc + { + path := "ingest6.sst" + f, err := memFS.Create(path) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + require.NoError(t, w.Set([]byte("cc"), []byte("doesntmatter"))) + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{path})) + } + + // Unblock earlier flushes. 
We will first finish flushing the blocked + // memtable, and end up in this state: + // + // Memtables: + // + // c-----d (mutable) + // b-----------------g (waiting to flush) + // + // Relevant L0 ssstables + // + // b-------------------g (irrelevant, just flushed) + // bb cc (has LargestSeqNum > earliestUnflushedSeqNum) + // bb cc-----e (has a rangedel) + // bb-----cc + // cc + // + // Note that while b----g is relatively old (and so has a low LargestSeqNum), + // it bridges a bunch of intervals. Had we regenerated sublevels from scratch, + // it'd have gone below the cc-e sstable. But due to #101896, we just slapped + // it on top. Now, as long as our seed interval is the one at cc and our seed + // file is the just-flushed L0 sstable, we will go down and include anything + // in that interval even if it has a LargestSeqNum > earliestUnflushedSeqNum. + // + // All asterisked L0 sstables should now get picked in an intra-L0 compaction + // right after the flush finishes, that we then block: + // + // b-------------------g* + // bb* cc* + // bb* cc-----e* + // bb-----cc* + // cc* + t.Log("unblocking flush") + flushSem <- struct{}{} + printLSM() + + select { + case sem := <-nextSem: + intraL0Sem = sem + case <-time.After(channelTimeout): + t.Fatal("did not get blocked on an intra L0 compaction") + } + + // Ensure all memtables are flushed. This will mean d=ThisShouldNotBeDeleted + // will land in L0 and since that was the last key written to a memtable, + // and the ingestion at cc came after it, the output of the intra-L0 + // compaction will elevate the cc-e rangedel above it and delete it + // (if #101896 is not fixed). + ch, _ := d.AsyncFlush() + <-ch + + // Unblock earlier intra-L0 compaction. + t.Log("unblocking intraL0") + intraL0Sem <- struct{}{} + printLSM() + + // Try reading d a couple times. 
+ for i := 0; i < 2; i++ { + val, closer, err := d.Get([]byte("d")) + require.NoError(t, err) + require.Equal(t, []byte("ThisShouldNotBeDeleted"), val) + if closer != nil { + closer.Close() + } + time.Sleep(100 * time.Millisecond) + } + + // Unblock everything. + baseCompactionSem <- struct{}{} +} + +func BenchmarkDelete(b *testing.B) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + const keyCount = 10000 + var keys [keyCount][]byte + for i := 0; i < keyCount; i++ { + keys[i] = []byte(strconv.Itoa(rng.Int())) + } + val := bytes.Repeat([]byte("x"), 10) + + benchmark := func(b *testing.B, useSingleDelete bool) { + d, err := Open( + "", + &Options{ + FS: vfs.NewMem(), + }) + if err != nil { + b.Fatal(err) + } + defer func() { + if err := d.Close(); err != nil { + b.Fatal(err) + } + }() + + b.StartTimer() + for _, key := range keys { + _ = d.Set(key, val, nil) + if useSingleDelete { + _ = d.SingleDelete(key, nil) + } else { + _ = d.Delete(key, nil) + } + } + // Manually flush as it is flushing/compaction where SingleDelete + // performance shows up. With SingleDelete, we can elide all of the + // SingleDelete and Set records. 
+ if err := d.Flush(); err != nil { + b.Fatal(err) + } + b.StopTimer() + } + + b.Run("delete", func(b *testing.B) { + for i := 0; i < b.N; i++ { + benchmark(b, false) + } + }) + + b.Run("single-delete", func(b *testing.B) { + for i := 0; i < b.N; i++ { + benchmark(b, true) + } + }) +} + +func BenchmarkNewIterReadAmp(b *testing.B) { + for _, readAmp := range []int{10, 100, 1000} { + b.Run(strconv.Itoa(readAmp), func(b *testing.B) { + opts := &Options{ + FS: vfs.NewMem(), + L0StopWritesThreshold: 1000, + } + opts.DisableAutomaticCompactions = true + + d, err := Open("", opts) + require.NoError(b, err) + + for i := 0; i < readAmp; i++ { + require.NoError(b, d.Set([]byte("a"), []byte("b"), NoSync)) + require.NoError(b, d.Flush()) + } + + require.Equal(b, d.Metrics().ReadAmp(), readAmp) + + b.StopTimer() + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StartTimer() + iter, _ := d.NewIter(nil) + b.StopTimer() + require.NoError(b, iter.Close()) + } + + require.NoError(b, d.Close()) + }) + } +} + +func verifyGet(t *testing.T, r Reader, key, expected []byte) { + val, closer, err := r.Get(key) + require.NoError(t, err) + if !bytes.Equal(expected, val) { + t.Fatalf("expected %s, but got %s", expected, val) + } + closer.Close() +} + +func verifyGetNotFound(t *testing.T, r Reader, key []byte) { + val, _, err := r.Get(key) + if err != base.ErrNotFound { + t.Fatalf("expected nil, but got %s", val) + } +} + +func BenchmarkRotateMemtables(b *testing.B) { + o := &Options{FS: vfs.NewMem(), MemTableSize: 64 << 20 /* 64 MB */} + d, err := Open("", o) + require.NoError(b, err) + + // We want to jump to full-sized memtables. 
+	d.mu.Lock()
+	d.mu.mem.nextSize = o.MemTableSize
+	d.mu.Unlock()
+	require.NoError(b, d.Flush())
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if err := d.Flush(); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
diff --git a/pebble/docs/RFCS/20211018_range_keys.md b/pebble/docs/RFCS/20211018_range_keys.md
new file mode 100644
index 0000000..890fa58
--- /dev/null
+++ b/pebble/docs/RFCS/20211018_range_keys.md
@@ -0,0 +1,961 @@
+- Feature Name: Range Keys
+- Status: draft
+- Start Date: 2021-10-18
+- Authors: Sumeer Bhola, Jackson Owens
+- RFC PR: #1341
+- Pebble Issues:
+  https://github.com/cockroachdb/pebble/issues/1339
+- Cockroach Issues:
+  https://github.com/cockroachdb/cockroach/issues/70429
+  https://github.com/cockroachdb/cockroach/issues/70412
+
+** Design Draft**
+
+# Summary
+
+An ongoing effort within CockroachDB to preserve MVCC history across all SQL
+operations (see cockroachdb/cockroach#69380) requires a more efficient method of
+deleting ranges of MVCC history.
+
+This document describes an extension to Pebble introducing first-class support
+for range keys. Range keys map a range of keyspace to a value. Optionally, the
+key range may include a suffix encoding a version (eg, MVCC timestamp). Pebble
+iterators may be configured to surface range keys during iteration, or to mask
+point keys at lower MVCC timestamps covered by range keys.
+
+CockroachDB will make use of these range keys to enable history-preserving
+removal of contiguous ranges of MVCC keys with constant writes, and efficient
+iteration past deleted versions.
+
+# Background
+
+A previous CockroachDB RFC cockroachdb/cockroach#69380 describes the motivation
+for the larger project of migrating MVCC-noncompliant operations into MVCC
+compliance. Implemented with the existing MVCC primitives, some operations like
+removal of an index or table would require performing writes linearly
+proportional to the size of the table. 
Dropping a large table using existing +MVCC point-delete primitives would be prohibitively expensive. The desire for a +sublinear delete of an MVCC range motivates this work. + +The detailed design for MVCC compliant bulk operations ([high-level +description](https://github.com/cockroachdb/cockroach/blob/master/docs/RFCS/20210825_mvcc_bulk_ops.md); +detailed design draft for DeleteRange in internal +[doc](https://docs.google.com/document/d/1ItxpitNwuaEnwv95RJORLCGuOczuS2y_GoM2ckJCnFs/edit#heading=h.x6oktstoeb9t)), +ran into complexity by placing range operations above the Pebble layer, such +that Pebble sees these as points. The complexity causes are various: (a) which +key (start or end) to anchor this range on, when represented as a point (there +are performance consequences), (b) rewriting on CockroachDB range splits (and +concerns about rewrite volume), (c) fragmentation on writes and complexity +thereof (and performance concerns for reads when not fragmenting), (d) inability +to efficiently skip older MVCC versions that are masked by a `[k1,k2)@ts` (where +ts is the MVCC timestamp). + +Pebble currently has only one kind of key that is associated with a range: +`RANGEDEL [k1, k2)#seq`, where [k1, k2) is supplied by the caller, and is used +to efficiently remove a set of point keys. + +First-class support for range keys in Pebble eliminates all these issues. +Additionally, it allows for future extensions like efficient transactional range +operations. This issue describes how this feature would work from the +perspective of a user of Pebble (like CockroachDB), and sketches some +implementation details. + +# Design + +## Interface + +### New `Comparer` requirements + +The Pebble `Comparer` type allows users to optionally specify a `Split` function +that splits a user key into a prefix and a suffix. 
This Split allows users +implementing MVCC (Multi-Version Concurrency Control) to inform Pebble which +part of the key encodes the user key and which part of the key encodes the +version (eg, a timestamp). Pebble does not dictate the encoding of an MVCC +version, only that the version form a suffix on keys. + +The range keys design described in this RFC introduces stricter requirements for +user-provided `Split` implementations and the ordering of keys: + +1. The user key consisting of just a key prefix `k` must sort before all + other user keys containing that prefix. Specifically + `Compare(k[:Split(k)], k) < 0` where `Split(k) < len(k)`. +2. A key consisting of a bare suffix must be a valid key and comparable. The + ordering of the empty key prefix with any suffixes must be consistent with + the ordering of those same suffixes applied to any other key prefix. + Specifically `Compare(k[Split(k):], k2[Split(k2):]) == Compare(k, k2)` where + `Compare(k[:Split(k)], k2[:Split(k2)]) == 0`. + +The details of why these new requirements are necessary are explained in the +implementation section. + +### Writes + +This design introduces three new write operations: + +- `RangeKeySet([k1, k2), [optional suffix], )`: This represents the + mapping `[k1, k2)@suffix => value`. Keys `k1` and `k2` must not contain a + suffix (i.e., `Split(k1)==len(k1)` and `Split(k2)==len(k2))`. + +- `RangeKeyUnset([k1, k2), [optional suffix])`: This removes a mapping + previously applied by `RangeKeySet`. The unset may use a smaller key range + than the original `RangeKeySet`, in which case only part of the range is + deleted. The unset only applies to range keys with a matching optional suffix. + If the optional suffix is absent in both the RangeKeySet and RangeKeyUnset, + they are considered matching. + +- `RangeKeyDelete([k1, k2))`: This removes all range keys within the provided + key span. It behaves like an `Unset` unencumbered by suffix restrictions. 
+ +For example, consider `RangeKeySet([a,d), foo)` (i.e., no suffix). If +there is a later call `RangeKeyUnset([b,c))`, the resulting state seen by +a reader is `[a,b) => foo`, `[c,d) => foo`. Note that the value is not +modified when the key is fragmented. + +Partially overlapping `RangeKeySet`s with the same suffix overwrite one +another. For example, consider `RangeKeySet([a,d), foo)`, followed by +`RangeKeySet([c,e), bar)`. The resulting state is `[a,c) => foo`, `[c,e) +=> bar`. + +Point keys (eg, traditional keys defined at a singular byte string key) and +range keys do not overwrite one another. They have a parallel existence. Point +deletes only apply to points. Range unsets only apply to range keys. However, +users may configure iterators to mask point keys covered by newer range keys. +This masking behavior is explicitly requested by the user in the context of the +iteration. Masking is described in more detail below. + +There exist separate range delete operations for point keys and range keys. A +`RangeKeyDelete` may remove part of a range key, just like the new +`RangeKeyUnset` operation introduced earlier. `RangeKeyDelete`s differ from +`RangeKeyUnset`s, because the latter requires that the suffix matches and +applies only to range keys. `RangeKeyDelete`s completely clear all existing +range keys within their span at all suffix values. + +The optional suffix in `RangeKeySet` and `RangeKeyUnset` operations is related +to the pebble `Comparer.Split` operation which is explicitly documented as being +for [MVCC +keys](https://github.com/cockroachdb/pebble/blob/e95e73745ce8a85d605ef311d29a6574db8ed3bf/internal/base/comparer.go#L69-L88), +without mandating exactly how the versions are represented. `RangeKeySet` and +`RangeKeyUnset` keys with different suffixes do not interact logically, although +Pebble will observably fragment ranges at intersection points. 
+ +### Iteration + +A user iterating over a key interval [k1,k2) can request: + +- **[I1]** An iterator over only point keys. + +- **[I2]** A combined iterator over point and range keys. This is what + we mainly discuss below in the implementation discussion. + +- **[I3]** An iterator over only range keys. In the CockroachDB use + case, range keys will need to be subject to MVCC GC just like + point keys — this iterator may be useful for that purpose. + +The `pebble.Iterator` type will be extended to provide accessors for +range keys for use in the combined and exclusively range iteration +modes. + +``` +// HasPointAndRange indicates whether there exists a point key, a range key or +// both at the current iterator position. +HasPointAndRange() (hasPoint, hasRange bool) + +// RangeKeyChanged indicates whether the most recent iterator positioning +// operation resulted in the iterator stepping into or out of a new range key. +// If true previously returned range key bounds and data has been invalidated. +// If false, previously obtained range key bounds, suffix and value slices are +// still valid and may continue to be read. +RangeKeyChanged() bool + +// Key returns the key of the current key/value pair, or nil if done. If +// positioned at an iterator position that only holds a range key, Key() +// always returns the start bound of the range key. Otherwise, it returns +// the point key's key. +Key() []byte + +// RangeBounds returns the start (inclusive) and end (exclusive) bounds of the +// range key covering the current iterator position. RangeBounds returns nil +// bounds if there is no range key covering the current iterator position, or +// the iterator is not configured to surface range keys. +// +// If valid, the returned start bound is less than or equal to Key() and the +// returned end bound is greater than Key(). +RangeBounds() (start, end []byte) + +// Value returns the value of the current key/value pair, or nil if done. 
+// The caller should not modify the contents of the returned slice, and +// its contents may change on the next call to Next. +// +// Only valid if HasPointAndRange() returns true for hasPoint. +Value() []byte + +// RangeKeys returns the range key values and their suffixes covering the +// current iterator position. The range bounds may be retrieved separately +// through RangeBounds(). +RangeKeys() []RangeKey + +type RangeKey struct { + Suffix []byte + Value []byte +} +``` + +When a combined iterator exposes range keys, it exposes all the range +keys covering `Key`. During iteration with a combined iterator, an +iteration position may surface just a point key, just a range key or +both at the currently-positioned `Key`. + +Described another way, a Pebble combined iterator guarantees that it +will stop at all positions within the keyspace where: +1. There exists a point key at that position. +2. There exists a range key that logically begins at that postition. + +In addition to the above positions, a Pebble iterator may also stop at keys +in-between the above positions due to fragmentation. Range keys are defined over +continuous spans of keyspace. Range keys with different suffix values may +overlap each other arbitrarily. To surface these arbitrarily overlapping spans +in an understandable and efficient way, the Pebble iterator surfaces range keys +fragmented at intersection points. Consider the following sequence of writes: + +``` + RangeKeySet([a,z), @1, 'apple') + RangeKeySet([c,e), @3, 'banana') + RangeKeySet([e,m), @5, 'orange') + RangeKeySet([b,k), @7, 'kiwi') +``` + +This yields a database containing overlapping range keys: +``` + @7 → kiwi |-----------------) + @5 → orange |---------------) + @3 → banana |---) + @1 → apple |-------------------------------------------------) + a b c d e f g h i j k l m n o p q r s t u v w x y z +``` + +During iteration, these range keys are surfaced using the bounds of their +intersection points. 
For example, a scan across the keyspace containing only +these range keys would observe the following iterator positions: + +``` + Key() = a RangeKeyBounds() = [a,b) RangeKeys() = {(@1,apple)} + Key() = b RangeKeyBounds() = [b,c) RangeKeys() = {(@7,kiwi), (@1,apple)} + Key() = c RangeKeyBounds() = [c,e) RangeKeys() = {(@7,kiwi), (@3,banana), (@1,apple)} + Key() = e RangeKeyBounds() = [e,k) RangeKeys() = {(@7,kiwi), (@5,orange), (@1,apple)} + Key() = k RangeKeyBounds() = [k,m) RangeKeys() = {(@5,orange), (@1,apple)} + Key() = m RangeKeyBounds() = [m,z) RangeKeys() = {(@1,apple)} +``` + +This fragmentation produces a more understandable interface, and avoids forcing +iterators to read all range keys within the bounds of the broadest range key. +Consider this example: + +``` + iterator pos [ ] - sstable bounds + | +L1: [a----v1@t2--|-h] [l-----unset@t1----u] +L2: [e---|------v1@t1----------r] + a b c d e f g h i j k l m n o p q r s t u v w x y z +``` + +If the iterator is positioned at a point key `g`, there are two overlapping +physical range keys: `[a,h)@t2→v1` and `[e,r)@t1→v1`. + +However, the `RangeKeyUnset([l,u), @t1)` removes part of the `[e,r)@t1→v1` range +key, truncating it to the bounds `[e,l)`. The iterator must return the truncated +bounds that correctly respect the `RangeKeyUnset`. However, when the range keys +are stored within a log-structured merge tree like Pebble, the `RangeKeyUnset` +may not be contained within the level's sstable that overlaps the current point +key. Searching for the unset could require reading an unbounded number of +sstables, losing the log-structured merge tree's property that bounds read +amplification to the number of levels in the tree. + +Fragmenting range keys to intersection points avoids this problem. The iterator +positioned at `g` only surfaces range key state with the bounds `[e,h)`, the +widest bounds in which it can guarantee t2→v1 and t1→v1 without loading +additional sstables. 
+ +#### Iteration order + +Recall that the user-provided `Comparer.Split(k)` function divides all user keys +into a prefix and a suffix, such that the prefix is `k[:Split(k)]`, and the +suffix is `k[Split(k):]`. If a key does not contain a suffix, the key equals the +prefix. + +An iterator that is configured to surface range keys alongside point keys will +surface all range keys covering the current `Key()` position. Revisiting an +earlier example with the addition of three new point key-value pairs: +a→artichoke, b@2→beet and t@3→turnip. Consider '@' to form the suffix +where present, with `` denoting a MVCC timestamp. Higher, more-recent +timestamps sort before lower, older timestamps. + +``` + . a → artichoke + @7 → kiwi |-----------------) + @5 → orange |---------------) + . b@2 b@2 → beet + @3 → banana |---) . t@3 t@3 → turnip + @1 → apple |-------------------------------------------------) + a b c d e f g h i j k l m n o p q r s t u v w x y z +``` + +An iterator configured to surface both point and range keys will visit the +following iterator positions during forward iteration: + +``` + Key() HasPointAndRange() Value() RangeKeyBounds() RangeKeys() + a (true, true) artichoke [a,b) {(@1,apple)} + b (false, true) - [b,c) {(@7,kiwi), (@1,apple)} + b@2 (true, true) beet [b,c) {(@7,kiwi), (@1,apple)} + c (false, true) - [c,e) {(@7,kiwi), (@3,banana), (@1,apple)} + e (false, true) - [e,k) {(@7,kiwi), (@5,orange), (@1,apple)} + k (false, true) - [k,m) {(@5,orange), (@1,apple)} + m (false, true) - [m,z) {(@1,apple)} + t@3 (true, true) turnip [m,z) {(@1,apple)} +``` + +Note that: + +- While positioned over a point key (eg, Key() = 'a', 'b@2' or t@3'), the + iterator exposes both the point key's value through Value() and the + overlapping range keys values through `RangeKeys()`. + +- There can be multiple range keys covering a `Key()`, each with a different + suffix. 
+ +- There cannot be multiple range keys covering a `Key()` with the same suffix, + since the most-recently committed one (eg, the one with the highest sequence + number) will win, just like for point keys. + +- If the iterator has configured lower and/or upper bounds, they will truncate + the range key to those bounds. For example, if the above iterator had an upper + bound 'y', the `[m,z)` range key would be surfaced with the bounds `[m,y)` + instead. + +#### Masking + +Range key masking provides additional, optional functionality designed +specifically for the use case of implementing a MVCC-compatible delete range. + +When constructing an iterator that iterators over both point and range keys, a +user may request that range keys mask point keys. Masking is configured with a +suffix parameter that determines which range keys may mask point keys. Only +range keys with suffixes that sort after the mask's suffix mask point keys. A +range key that meets this condition only masks points with suffixes that sort +after the range key's suffix. + +``` +type IterOptions struct { + // ... + RangeKeyMasking RangeKeyMasking +} + +// RangeKeyMasking configures automatic hiding of point keys by range keys. +// A non-nil Suffix enables range-key masking. When enabled, range keys with +// suffixes ≥ Suffix behave as masks. All point keys that are contained within +// a masking range key's bounds and have suffixes greater than the range key's +// suffix are automatically skipped. +// +// Specifically, when configured with a RangeKeyMasking.Suffix _s_, and there +// exists a range key with suffix _r_ covering a point key with suffix _p_, and +// +// _s_ ≤ _r_ < _p_ +// +// then the point key is elided. +// +// Range-key masking may only be used when iterating over both point keys and +// range keys. +type RangeKeyMasking struct { + // Suffix configures which range keys may mask point keys. 
Only range keys + // that are defined at suffixes greater than or equal to Suffix will mask + // point keys. + Suffix []byte + // Filter is an optional field that may be used to improve performance of + // range-key masking through a block-property filter defined over key + // suffixes. If non-nil, Filter is called by Pebble to construct a + // block-property filter mask at iterator creation. The filter is used to + // skip whole point-key blocks containing point keys with suffixes greater + // than a covering range-key's suffix. + // + // To use this functionality, the caller must create and configure (through + // Options.BlockPropertyCollectors) a block-property collector that records + // the maxmimum suffix contained within a block. The caller then must write + // and provide a BlockPropertyFilterMask implementation on that same + // property. See the BlockPropertyFilterMask type for more information. + Filter func() BlockPropertyFilterMask +} +``` + +Example: A user may construct an iterator with `RangeKeyMasking.Suffix` set to +`@50`. The range key `[a, c)@60` would mask nothing, because `@60` is a more +recent timestamp than `@50`. However a range key `[a,c)@30` would mask `a@20` +and `apple@10` but not `apple@40`. A range key can only mask keys with MVCC +timestamps older than the range key's own timestamp. Only range keys with +suffixes (eg, MVCC timestamps) may mask anything at all. + +The pebble Iterator surfaces all range keys when masking is enabled. Only point +keys are ever skipped, and only when they are contained within the bounds of a +range key with a more-recent suffix, and the range key's suffix is older than +the timestamp encoded in `RangeKeyMasking.Sufffix`. + +## Implementation + +### Write operations + +This design introduces three new Pebble write operations: `RangeKeySet`, +`RangeKeyUnset` and `RangeKeyDelete`. 
Internally, these operations are +represented as internal keys with new corresponding key kinds encoded as a part +of the key trailer. These keys are stored within special range key blocks +separate from point keys, but within the same sstable. The range key blocks hold +`RangeKeySet`, `RangeKeyUnset` and `RangeKeyDelete` keys, but do not hold keys +of any other kind. Within the memtables, these range keys are stored in a +separate skip list. + +- `RangeKeySet([k1,k2), @suffix, value)` is encoded as a `k1.RANGEKEYSET` key + with a value encoding the tuple `(k2,@suffix,value)`. +- `RangeKeyUnset([k1,k2), @suffix)` is encoded as a `k1.RANGEUNSET` key + with a value encoding the tuple `(k2,@suffix)`. +- `RangeKeyDelete([k1,k2)` is encoded as a `k1.RANGEKEYDELETE` key with a value + encoding `k2`. + +Range keys are physically fragmented as an artifact of the log-structured merge +tree structure and internal sstable boundaries. This fragmentation is essential +for preserving the performance characteristics of a log-structured merge tree. +Although the public interface operations for `RangeKeySet` and `RangeKeyUnset` +require both boundary keys `[k1,k2)` to always be bare prefixes (eg, to not have +a suffix), internally these keys may be fragmented to bounds containing +suffixes. + +Example: If a user attempts to write `RangeKeySet([a@v1, c@v2), @v3, value)`, +Pebble will return an error to the user. If a user writes `RangeKeySet([a, c), +@v3, value)`, Pebble will allow the write and may later internally fragment the +`RangeKeySet` into three internal keys: + - `RangeKeySet([a, a@v1), @v3, value)` + - `RangeKeySet([a@v1, c@v2), @v3, value)` + - `RangeKeySet([c@v2, c), @v3, value)` + +This fragmentation preserve log-structured merge tree performance +characteristics because it allows a range key to be split across many sstables, +while preserving locality between range keys and point keys. 
Consider a +`RangeKeySet([a,z), @1, foo)` on a database that contains millions of point keys +in the range [a,z). If the [a,z) range key was not permitted to be fragmented +internally, it would either need to be stored completely separately from the +point keys in a separate sstable or in a single intractably large sstable +containing all the overlapping point keys. Fragmentation allows locality, +ensuring point keys and range keys in the same region of the keyspace can be +stored in the same sstable. + +`RangeKeySet`, `RangeKeyUnset` and `RangeKeyDelete` keys are assigned sequence +numbers, like other internal keys. Log-structured merge tree level invariants +are valid across range key, point keys and between the two. That is: + + 1. The point key `k1#s2` cannot be at a lower level than `k2#s1` where + `k1==k2` and `s1 < s2`. This is the invariant implemented by all LSMs. + 2. `RangeKeySet([k1,k2))#s2` cannot be at a lower level than + `RangeKeySet([k3,k4))#s1` where `[k1,k2)` overlaps `[k3,k4)` and `s1 < s2`. + 3. `RangeKeySet([k1,k2))#s2` cannot be at a lower level than a point key + `k3#s1` where `k3 \in [k1,k2)` and `s1 < s2`. + +Like other tombstones, the `RangeKeyUnset` and `RangeKeyDelete` keys are elided +when they fall to the bottomost level of the LSM and there is no snapshot +preventing its elision. There is no additional garbage collection problem +introduced by these keys. + +There is no Merge operation that affects range keys. + +#### Physical representation + +`RangeKeySet`, `RangeKeyUnset` and `RangeKeyDelete` keys are keyed by their +start key. This poses an obstacle. We must be able to support multiple range +keys at the same sequence number, because all keys within an ingested sstable +adopt the same sequence number. Duplicate internal keys (keys with equal user +keys, sequence numbers and kinds) are prohibited within Pebble. 
To resolve this +issue, fragments with the same bounds are merged within snapshot stripes into a +single physical key-value, representing multiple logical key-value pairs: + +``` +k1.RangeKeySet#s2 → (k2,[(@t2,v2),(@t1,v1)]) +``` + +Within a physical key-value pair, suffix-value pairs are stored sorted by +suffix, descending. This has a minor advantage of reducing iteration-time +user-key comparisons when there exist multiple range keys in a table. + +Unlike other Pebble keys, the `RangeKeySet` and `RangeKeyUnset` keys have values +that encode fields of data known to Pebble. The value that the user sets in a +call to `RangeKeySet` is opaque to Pebble, but the physical representation of +the `RangeKeySet`'s value is known. This encoding is a sequence of fields: + +* End key, `varstring`, encodes the end user key of the fragment. +* A series of (suffix, value) tuples representing the logical range keys that + were merged into this one physical `RangeKeySet` key: + * Suffix, `varstring` + * Value, `varstring` + +Similarly, `RangeKeyUnset` keys are merged within snapshot stripes and have a +physical representation like: + +``` +k1.RangeKeyUnset#s2 → (k2,[(@t2),(@t1)]) +``` + +A `RangeKeyUnset` key's value is encoded as: +* End key, `varstring`, encodes the end user key of the fragment. +* A series of suffix `varstring`s. + +When `RangeKeySet` and `RangeKeyUnset` fragments with identical bounds meet +within the same snapshot stripe within a compaction, any of the +`RangeKeyUnset`'s suffixes that exist within the `RangeKeySet` key are removed. + +A `RangeKeyDelete` key has no additional data beyond its end key, which is +encoded directly in the value. + +NB: `RangeKeySet` and `RangeKeyUnset` keys are not merged within batches or the +memtable. That's okay, because batches are append-only and indexed batches will +refragment and merge the range keys on-demand. In the memtable, every key is +guaranteed to have a unique sequence number. 
+ +### Sequence numbers + +Like all Pebble keys, `RangeKeySet`, `RangeKeyUnset` and `RangeKeyDelete` are +assigned sequence numbers when committed. As described above, overlapping +`RangeKeySet`s and `RangeKeyUnset`s are fragmented to have matching start and +end bounds. Then the resulting exactly-overlapping range key fragments are +merged into a single internal key-value pair, within the same snapshot stripe +and sstable. The original, unmerged internal keys each have their own sequence +numbers, indicating the moment they were committed within the history of all +write operations. + +Recall that sequence numbers are used within Pebble to determine which keys +appear live to which iterators. When an iterator is constructed, it takes note +of the current _visible sequence number_, and for the lifetime of the iterator, +only surfaces keys less than that sequence number. Similarly, snapshots read the +current _visible sequence number_, remember it, but also leave a note asking +compactions to preserve history at that sequence number. The space between +snapshotted sequence numbers is referred to as a _snapshot stripe_, and +operations cannot drop or otherwise mutate keys unless they fall within the same +_snapshot stripe_. For example a `k.MERGE#5` key may not be merged with a +`k.MERGE#1` operation if there's an open snapshot at `#3`. + +The new `RangeKeySet`, `RangeKeyUnset` and `RangeKeyDelete` keys behave +similarly. Overlapping range keys won't be merged if there's an open snapshot +separating them. Consider a range key `a-z` written at sequence number `#1` and +a point key `d.SET#2`. A combined point-and-range iterator using a sequence +number `#3` and positioned at `d` will surface both the range key `a-z` and the +point key `d`. + +In the context of masking, the suffix-based masking of range keys can cause +potentially unexpected behavior. A range key `[a,z)@10` may be committed as +sequence number `#1`. Afterwards, a point key `d@5#2` may be committed. 
An +iterator that is configured with range-key masking with suffix `@20` would mask +the point key `d@5#2` because although `d@5#2`'s sequence number is higher, +range-key masking uses suffixes to impose order, not sequence numbers. + +### Boundaries for sstables + +Range keys follow the same relationship to sstable bounadries as the existing +`RANGEDEL` tombstones. The bounds of an internal range key are user keys. Every +range key is limited by its containing sstable's bounds. + +Consider these keys, annotated with sequence numbers: + +``` +Point keys: a#50, b#70, b#49, b#48, c#47, d#46, e#45, f#44 +Range key: [a,e)#60 +``` + +We have created three versions of `b` in this example. In previous versions, +Pebble could split output sstables during a compaction such that the different +`b` versions span more than one sstable. This creates problems for `RANGEDEL`s +which span these two sstables which are discussed in the section on [improperly +truncated RANGEDELS](https://github.com/cockroachdb/pebble/blob/master/docs/range_deletions.md#improperly-truncated-range-deletes). +We manage to tolerate this for `RANGEDEL`s since their semantics are defined by +the system, which is not true for these range keys where the actual semantics +are up to the user. + +Pebble now disallows such sstable split points. In this example, by postponing +the sstable split point to the user key c, we can cleanly split the range key +into `[a,c)#60` and `[c,e)#60`. The sstable end bound for the first sstable +(sstable bounds are inclusive) will be c#inf (where inf is the largest possible +seqnum, which is unused except for these cases), and sstable start bound for the +second sstable will be c#60. + +The above example deals exclusively with point and range keys without suffixes. 
+Consider this example with suffixed keys, and compaction outputs split in the +middle of the `b` prefix: + +``` +first sstable: points: a@100, a@30, b@100, b@40 ranges: [a,c)@50 +second sstable: points: b@30, c@40, d@40, e@30, ranges: [c,e)@50 +``` + +When the compaction code decides to defer `b@30` to the next sstable and finish +the first sstable, the range key `[a,c)@50` is sitting in the fragmenter. The +compaction must split the range key at the bounds determined by the user key. +The compaction uses the first point key of the next sstable, in this case +`b@30`, to truncate the range key. The compaction flushes the fragment +`[a,b@30)@50` to the first sstable and updates the existing fragment to begin at +`b@30`. + +If a range key extends into the next file, the range key's truncated end is used +for the purposes of determining the sstable end boundary. The first sstable's +end boundary becomes `b@30#inf`, signifying the range key does not cover `b@30`. +The second sstable's start boundary is `b@30`. + +### Block property collectors + +Separate block property collectors may be configured to collect separate +properties about range keys. This is necessary for CockroachDB's MVCC block +property collectors to ensure the sstable-level properties are correct. + +### Iteration + +This design extends the `*pebble.Iterator` with the ability to iterate over +exclusively range keys, range keys and point keys together or exclusively point +keys (the previous behavior). + +- Pebble already requires that the prefix `k` follows the same key validity + rules as `k@suffix`. + +- Previously, Pebble did not require that a user key consisting of just a prefix + `k` sort before the same prefix with a non-empty suffix. CockroachDB has + adopted this behavior since it results in the following clean behavior: + `RANGEDEL` over [k1, k2) deletes all versioned keys which have prefixes in the + interval [k1, k2). Pebble will now require this behavior for all users using + MVCC keys. 
Specifically, it must hold that `Compare(k[:Split(k)], k) < 0` if + `Split(k) < len(k)`. + +# TKTK: Discuss merging iterator + +#### Determinism + +Range keys will be split based on boundaries of sstables in an LSM. Users of an +LSM typically expect that two different LSMs with different sstable settings +that receive the same writes should output the same key-value pairs when +iterating. To provide this behavior, the iterator implementation may be +configured to defragment range keys during iteration time. The defragmentation +behavior would be: + +- Two visible ranges `[k1,k2)@suffix1=>val1`, `[k2,k3)@suffix2=>val2` are + defragmented if suffix1==suffix2 and val1==val2, and become [k1,k3). + +- Defragmentation during user iteration does not consider the sequence number. + This is necessary since LSM state can be exported to another LSM via the use + of sstable ingestion, which can collapse different seqnums to the same seqnum. + We would like both LSMs to look identical to the user when iterating. + +The above defragmentation is conceptually simple, but hard to implement +efficiently, since it requires stepping ahead from the current position to +defragment range keys. This stepping ahead could switch sstables while there are +still points to be consumed in a previous sstable. This determinism is useful +for testing and verification purposes: + +- Randomized and metamorphic testing is used extensively to reliably test + software including Pebble and CockroachDB. Defragmentation provides + the determinism necessary for this form of testing. + +- CockroachDB's replica divergence detector requires a consistent view of the + database on each replica. + +In order to provide determinism, Pebble constructs an internal range key +iterator stack that's separate from the point iterator stack, even when +performing combined iteration over both range and point keys. 
The separate range +key iterator allows the internal range key iterator to move independently of the +point key iterator. This allows the range key iterator to independently visit +adjacent sstables in order to defragment their range keys if necessary, without +repositioning the point iterator. + +Two spans [k1,k2) and [k3, k4) of range keys are defragmented if their bounds +abut and their user observable-state is identical. That is, `k2==k3` and each +spans' contains exactly the same set of range key (, ) pairs. In +order to support `RangeKeyUnset` and `RangeKeyDelete`, defragmentation must be +applied _after_ resolving unset and deletes. + +#### Merging iteration + +Recall that range keys are stored in the same sstables as point keys. In a +log-structured merge tree, these sstables are distributed across levels. Within +a level, sstables are non-overlapping but between levels sstables may overlap +arbitrarily. During iteration, keys across levels must be merged together. For +point keys, this is typically done with a heap. + +Range keys too must be merged across levels, and the earlier described +fragmentation at intersection boundaries must be applied. To implement this, a +range key merging iterator is defined. + +A merging iterator is initialized with an arbitrary number of child iterators +over fragmented spans. Each child iterator exposes fragmented range keys, such +that overlapping range keys are surfaced in a single span with a single set of +bounds. Range keys from one child iterator may overlap key spans from another +child iterator arbitrarily. The high-level algorithm is: + +1. Initialize a heap with bound keys from child iterators' range keys. +2. Find the next [or previous, if in reverse] two unique user keys' from bounds. +3. Consider the span formed between the two unique user keys a candidate span. +4. Determine if any of the child iterators' spans overlap the candidate span. + 4a. 
If any of the child iterator's current bounds are end keys (during + forward iteration) or start keys (during reverse iteration), then all the + spans with that bound overlap the candidate span. + 4b. If no spans overlap, forget the smallest (forward iteration) or largest + (reverse iteration) unique user key and advance the iterators to the next + unique user key. Start again from 3. + +Consider the example: + +``` + i0: b---d e-----h + i1: a---c h-----k + i2: a------------------------------p + +fragments: a-b-c-d-e-----h-----k----------p +``` + +None of the individual child iterators contain a span with the exact bounds +[c,d), but the merging iterator must produce a span [c,d). To accomplish this, +the merging iterator visits every span between unique boundary user keys. In the +above example, this is: + +``` +[a,b), [b,c), [c,d), [d,e), [e, h), [h, k), [k, p) +``` + +The merging iterator first initializes the heap to prepare for iteration. The +description below discusses the mechanics of forward iteration after a call to +First, but the mechanics are similar for reverse iteration and other positioning +methods. + +During a call to First, the heap is initialized by seeking every level to the +first bound of the first fragment. In the above example, this seeks the child +iterators to: + +``` +i0: (b, boundKindStart, [ [b,d) ]) +i1: (a, boundKindStart, [ [a,c) ]) +i2: (a, boundKindStart, [ [a,p) ]) +``` + +After fixing up the heap, the root of the heap is the bound with the smallest +user key ('a' in the example). During forward iteration, the root of the heap's +user key is the start key of next merged span. The merging iterator records this +key as the start key. The heap may contain other levels with range keys that +also have the same user key as a bound of a range key, so the merging iterator +pulls from the heap until it finds the first bound greater than the recorded +start key. 
+ +In the above example, this results in the bounds `[a,b)` and child iterators in +the following positions: + +``` +i0: (b, boundKindStart, [ [b,d) ]) +i1: (c, boundKindEnd, [ [a,c) ]) +i2: (p, boundKindEnd, [ [a,p) ]) +``` + +With the user key bounds of the next merged span established, the merging +iterator must determine which, if any, of the range keys overlap the span. +During forward iteration any child iterator that is now positioned at an end +boundary has an overlapping span. (Justification: The child iterator's end +boundary is ≥ the new end bound. The child iterator's range key's corresponding +start boundary must be ≤ the new start bound since there were no other user keys +between the new span's bounds. So the fragments associated with the iterator's +current end boundary have start and end bounds such that start ≤ < ≤ end). + +The merging iterator iterates over the levels, collecting keys from any child +iterators positioned at end boundaries. In the above example, i1 and i2 are +positioned at end boundaries, so the merging iterator collects the keys of [a,c) +and [a,p). These spans contain the merging iterator's [a,b) span, but they may +also extend beyond the new span's start and end. The merging iterator returns +the keys with the new start and end bounds, preserving the underlying keys' +sequence numbers, key kinds and values. + +It may be the case that the merging iterator finds no levels positioned at span +end boundaries in which case the span overlaps with nothing. In this case the +merging iterator loops, repeating the above process again until it finds a span +that does contain keys. 
+ +#### Efficient masking + +Recollect that in the earlier example from the iteration interface, during +forward iteration an iterator would output the following keys: + +``` + Key() HasPointAndRange() Value() RangeKeyBounds() RangeKeys() + a (true, true) artichoke [a,b) {(@1,apple)} + b (false, true) - [b,c) {(@7,kiwi), (@1,apple)} + b@2 (true, true) beet [b,c) {(@7,kiwi), (@1,apple)} + c (false, true) - [c,e) {(@7,kiwi), (@3,banana), (@1,apple)} + e (false, true) - [e,k) {(@7,kiwi), (@5,orange), (@1,apple)} + k (false, true) - [k,m) {(@5,orange), (@1,apple)} + m (false, true) - [m,z) {(@1,apple)} + t@3 (true, true) turnip [m,z) {(@1,apple)} +``` + +When implementing an MVCC "soft delete range" operation using range keys, the +range key `[b,k)@7→kiwi` may represent that all keys within the range [b,k) are +deleted at MVCC timestamp @7. During iteration, it would be desirable if the +caller could indicate that it does not want to observe any "soft deleted" point +keys, and the iterator can safely skip them. Note that in a MVCC system, whether +or not a key is soft deleted depends on the timestamp at which the database is +read. + +This is implemented through "range key masking," where a range key may act as a +mask, hiding point keys with MVCC timestamps beneath the range key. This +iterator option requires that the client configure the iterator with a MVCC +timestamp `suffix` representing the timestamp at which history should be read. +All range keys with suffixes (MVCC timestamps) less than or equal to the +configured suffix serve as masks. All point keys with suffixes (MVCC timestamps) +less than a covering, masking range key's suffix are hidden. + +Specifically, when configured with a RangeKeyMasking.Suffix _s_, and there +exists a range key with suffix _r_ covering a point key with suffix _p_, and _s_ +≤ _r_ < _p_ then the point key is elided. 
+ +In the above example, if `RangeKeyMasking.Suffix` is set to `@7`, every range +key serves as a mask and the point key `b@2` is hidden during iteration because +it's contained within the masking `[b,k)@7→kiwi` range key. Note that `t@3` +would _not_ be masked, because its timestamp `@3` is more recent than the only +range key that covers it (`[a,z)@1→apple`). + +If `RangeKeyMasking.Suffix` were set to `@6` (a historical, point-in-time read), +the `[b,k)@7→kiwi` range key would no longer serve as a mask, and `b@2` would be +visible. + +To efficiently implement masking, we cannot rely on the LSM invariant since +`b@100` can be at a lower level than `[a,e)@50`. Instead, we build on +block-property filters, supporting special use of a MVCC timestamp block +property in order to skip blocks wholly containing point keys that are masked by +a range key. The client may configure a block-property collector to record the +highest MVCC timestamps of point keys within blocks. + +During read time, when positioned within a range key with a suffix ≤ +`RangeKeyMasking.Suffix`, the iterator configures sstable readers to use a +block-property filter to skip any blocks for which the highest MVCC timestamp is +less than the provided suffix. Additionally, these iterators must consult index +block bounds to ensure the block-property filter is not applied beyond the +bounds of the masking range key. + +### CockroachDB use + +CockroachDB initially will only use range keys to represent MVCC range +tombstones. See the MVCC range tombstones tech note for more details: + +https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/mvcc-range-tombstones.md + +### Alternatives + +#### A1. Automatic elision of range keys that don't cover keys + +We could decide that range keys: + +- Don't contribute to `MVCCStats` themselves. +- May be elided by Pebble when they cover zero point keys. 
+ +This means that CockroachDB garbage collection does not need to explicitly +remove the range keys, only the point keys they deleted. This option is clean +when paired with `RANGEDEL`s dropping both point and range keys. CockroachDB can +issue `RANGEDEL`s whenever it wants to drop a contiguous swath of points, and +not worry about the fact that it might also need to update the MVCC stats for +overlapping range keys. + +However, this option makes deterministic iteration over defragmented range keys +for replica divergence detection challenging, because internal fragmentation may +elide regions of a range key at any point. Producing a normalized form would +require storing state in the value (ie, the original start key) and +recalculating the smallest and largest extant covered point keys within the +range key and replica bounds. This would require maintaining _O_(range-keys) +state during the `storage.ComputeStatsForRange` pass over a replica's combined +point and range iterator. + +This likely forces replica divergence detection to use other means (eg, altering +the checksum of covered points) to incorporate MVCC range tombstone state. + +This option is also highly tailored to the MVCC Delete Range use case. Other +range key usages, like ranged intents, would not want this behavior, so we don't +consider it further. + +#### A2. Separate LSM of range keys + +There are two viable options for where to store range keys. They may be encoded +within the same sstables as points in separate blocks, or in separate sstables +forming a parallel range-key LSM. We examine the tradeoffs between storing range +keys in the same sstable in different blocks ("shared sstables") or separate +sstables forming a parallel LSM ("separate sstables"): + +- Storing range keys in separate sstables is possible because the only + iteractions between range keys and point keys happens at a global level. + Masking is defined over suffixes. 
It may be extended to be defined over + sequence numbers too (see 'Sequence numbers' section below), but that is + optional. Unlike range deletion tombstones, range keys have no effect on point + keys during compactions. + +- With separate sstables, reads may need to open additional sstable(s) and read + additional blocks. The number of additional sstables is the number of nonempty + levels in the range-key LSM, so it grows logarithmically with the number of + range keys. For each sstable, a read must read the index block and a data + block. + +- With our expectation of few range keys, the range-key LSM is expected to be + small, with one or two levels. Heuristics around sstable boundaries may + prevent unnecessary range-key reads when there is no covering range key. Range + key sstables and blocks are expected to have much higher table and block cache + hit rates, since they are orders of magnitude less dense. Reads in any + overlapping point sstables all access the same range key sstables. + +- With shared sstables, `SeekPrefixGE` cannot use bloom filters to entirely + eliminate sstables that contain range keys. Pebble does not always use bloom + filters in L6, so once a range key is compacted into L6 its impact to + `SeekPrefixGE` is lessened. With separate sstables, `SeekPrefixGE` can always + use bloom filters for point-key sstables. If there are any overlapping + range-key sstables, the read must read them. + +- With shared sstables, range keys create dense sstable boundaries. A range key + spanning an sstable boundary leaves no gap between the sstables' bounds. This + can force ingested sstables into higher levels of the LSM, even if the + sstables' point key spans don't overlap. This problem was previously observed + with wide `RANGEDEL` tombstones and was mitigated by prioritizing compaction + of sstables that contain `RANGEDEL` keys. We could do the same with range + keys, but the write amplification is expected to be much worse. 
The `RANGEDEL` + tombstones drop keys and eventually are dropped themselves as long as there is + not an open snapshot. Range keys do not drop data and are expected to persist + in L6 for long durations, always requiring ingested sstables to be inserted + into L5 or above. + +- With separate sstables, compaction logic is separate, which helps avoid + complexity of tricky sstable boundary conditions. Because there are expected + to be an order of magnitude fewer range keys, we could impose the constraint + that a prefix cannot be split across multiple range key sstables. The + simplified compaction logic comes at the cost of higher levels, iterators, etc + all needing to deal with the concept of two parallel LSMs. + +- With shared sstables, the LSM invariant is maintained between range keys and + point keys. For example, if the point key `b@20` is committed, and + subsequently a range key `RangeKey([a,c), @25, ...)` is committed, the range + key will never fall below the covered point `b@20` within the LSM. + +We decide to share sstables, because preserving the LSM invariant between range +keys and point keys is expected to be useful in the long-term. + +#### A3. Sequence number masking + +In the CockroachDB MVCC range tombstone use case, a point key should never be +written below an existing range key with a higher timestamp. The MVCC range +tombstone use case would allow us to dictate that an overlapping range key with +a higher sequence number always masks range keys with lower sequence numbers. +Adding this additional masking scope would avoid the comparatively costly suffix +comparison when a point key _is_ masked by a range key. We need to consider how +sequence number masking might be affected by the merging of range keys within +snapshot stripes. + +Consider the committing of range key `[a,z)@{t1}#10`, followed by point keys +`d@t2#11` and `m@t2#11`, followed by range key `[j,z)@{t3}#12`. 
This sequencing +respects the expected timestamp, sequence number relationship in CockroachDB's +use case. If all keys are flushed within the same sstable, fragmentation and +merging overlapping fragments yields range keys `[a,j)@{t1}#10`, +`[j,z)@{t3,t1}#12`. The key `d@t2#11` must not be masked because it's not +covered by the new range key, and indeed that's the case because the covering +range key's fragment is unchanged `[a,j)@{t1}#10`. + +For now we defer this optimization, with the expectation that we may not be able +to preserve this relationship between sequence numbers and suffixes in all range +key use cases. diff --git a/pebble/docs/RFCS/20220112_pebble_sstable_format_versions.md b/pebble/docs/RFCS/20220112_pebble_sstable_format_versions.md new file mode 100644 index 0000000..c2f792f --- /dev/null +++ b/pebble/docs/RFCS/20220112_pebble_sstable_format_versions.md @@ -0,0 +1,290 @@ +- Feature Name: Pebble SSTable Format Versions +- Status: completed +- Start Date: 2022-01-12 +- Authors: Nick Travers +- RFC PR: https://github.com/cockroachdb/pebble/pull/1450 +- Pebble Issues: + https://github.com/cockroachdb/pebble/issues/1409 + https://github.com/cockroachdb/pebble/issues/1339 +- Cockroach Issues: + +# Summary + +To safely support changes to the SSTable structure, a new versioning scheme +under a Pebble magic number is proposed. + +This RFC also outlines the relationship between the SSTable format version and +the existing Pebble format major version, in addition to how the two are to +be used in Cockroach for safely enabling new table format versions. + +# Motivation + +Pebble currently uses a "format major version" scheme for the store (or DB) +that indicates which Pebble features should be enabled when the store is first +opened, before any SSTables are opened. The versions indicate points of +backwards incompatibility for a store. 
For example, the introduction of the +`SetWithDelete` key kind is gated behind a version, as is block property +collection. This format major version scheme was introduced in +[#1227](https://github.com/cockroachdb/pebble/issues/1227). + +While Pebble can use the format major version to infer how to load and +interpret data in the LSM, the SSTables that make up the store itself have +their own notion of a "version". This "SSTable version" (also referred to as a +"table format") is written to the footer (or trailing section) of each SSTable +file and determines how the file is to be interpreted by Pebble. As of the time +of writing, Pebble supports two table formats - LevelDB's format, and RocksDB's +v2 format. Pebble inherited the latter as the default table format as it was +the version that RocksDB used at the time Pebble was being developed, and +remained the default to allow for a simpler migration path from Cockroach +clusters that were originally using RocksDB as the storage engine. The +RocksDBv2 table format adds various features on top of the LevelDB format, +including a two-level index, configurable checksum algorithms, and an explicit +versioning scheme to allow for the introduction of changes, amongst other +features. + +While the RocksDBv2 SSTable format has been sufficient for Pebble's needs since +inception, new Pebble features and potential backports from RocksDB itself +require that the SSTable format evolve over time and therefore that the table +format be updated. As the majority of new features added over time will be +specific to Pebble, it does not make sense to repurpose the RocksDB format +versions that exist upstream for use with Pebble features (at the time of +writing, RocksDB had added versions 3 and 4 on top of the version 2 in use by +Pebble). A new Pebble-specific table format scheme is proposed. + +In the context of a distributed system such as Cockroach, certain SSTable +features are backwards incompatible (e.g. 
the block property collection and +filtering feature extends the RocksDBv2 SSTable block index format to encode +various block properties, which is a breaking change). Participants must +_first_ ensure that their stores have the code-level features available to read +and write these newer SSTables (indicated by Pebble's format major version). +Once all stores agree that they are running the minimum Pebble format major +version and will not roll back (e.g. Cockroach cluster version finalization), +SSTables can be written and read using more recent table formats. The Pebble +"format major version" and "table format version" are therefore no longer +independent - the former implies an upper bound on the latter. + +Additionally, certain SSTable generation operations are independent of a +specific Pebble instance. For example, SSTable construction for the purposes of +backup and restore generates SSTables that are stored external to a specific +Pebble store (e.g. in cloud storage) and can be used at a later point in time to +restore a store. SSTables constructed for such purposes must be carefully +versioned to ensure compatibility with existing clusters that may run with a +mixture of Pebble versions. + +As a real-world example of the need for the above, consider two Cockroach nodes +each with a Pebble store, one at version A, the other at version B (version A +(newer) > B (older)). Store A constructs an SSTable for an external backup +containing a newer block index format (for block property collection). This +SSTable is then imported into store B. Store B fails to read the SSTable as it +is not running with a format major version recent enough to make sense of the +newer index format. The two stores require a method for agreeing on a minimum +supported table format. + +The remainder of this document outlines a new table format for Pebble. 
This new +table format will be used for new table-level features such as block properties +and range keys (see +[#1339](https://github.com/cockroachdb/pebble/issues/1339)), but also for +backporting table-level features from RocksDB that would be useful to Pebble +(e.g. version 3 avoids encoding sequence numbers in the index, and version 4 +uses delta encoding for the block offsets in the index, both of which are +useful for Pebble). + +# Technical design + +## Pebble magic number + +The last 8 bytes of an SSTable is referred to as the "magic number". + +LevelDB uses the first 8 bytes of the SHA1 hash of the string +`http://code.google.com/p/leveldb/` for the magic number. + +RocksDB uses its own magic number, which indicates the use of a slightly +different table layout - the footer (the name for the end of an SSTable) is +slightly larger to accommodate a 32-bit version number and 8 bits for a +checksum type to be used for all blocks in the SSTable. + +A new 8-byte magic number will be introduced for Pebble: + +``` +\xf0\x9f\xaa\xb3\xf0\x9f\xaa\xb3 // 🪳🪳 +``` + +## Pebble version scheme + +Tables with a Pebble magic number will use a dedicated versioning scheme, +starting with version `1`. No new versions other than version `2` will be +supported for tables containing the RocksDB magic number. + +The choice of switching to a Pebble versioning scheme starting `1` simplifies +the implementation. Essentially all existing Pebble stores are managed via +Cockroach, and were either previously using RocksDB and migrated to Pebble, or +were created with Pebble stores. In both situations the table format used is +RocksDB v2. + +Given that Pebble has not needed (and likely will not need) to support other +RocksDB table formats, it is reasonable to introduce a new magic number for +Pebble and reset the version counter to v1. 
+ +The following initial versions will correspond to the following new Pebble +features that have yet to be introduced to Cockroach clusters as of the time +of writing: + +- Version 1: block property collectors (block properties are encoded into the + block index) +- Version 2: range keys (a new block is present in the table for range keys). + +Subsequent alterations to the SSTable format should only increment the _Pebble +version number_. It should be noted that backported RocksDB table format +features (e.g. RocksDB versions 3 and 4) would use a different version number, +within the Pebble version sequence. While possibly confusing, the RocksDB +features are being "adopted" by Pebble, rather than directly ported, so a +Pebble specific version number is appropriate. + +An alternative would be to allow RocksDB table format features to be backported +into Pebble under their existing RocksDB magic number, _alongside_ +Pebble-specific features. The complexity required to determine the set of +characteristics to read and write to each SSTable would increase with such a +scheme, compared to the simpler "linear history" approach described above, +where new features simply ratchet the Pebble table format version number. + +## Footer format + +The footer format for SSTables with Pebble magic numbers _will remain the same_ +as the RocksDB footer format - specifically, the trailing 53-bytes of the +SSTable consisting of the following fields with the given indices, +little-endian encoded: + +- `0`: Checksum type +- `1-20`: Meta-index block handle +- `21-40`: Index block handle +- `41-44`: Version number +- `45-52`: Magic number + +## Changes / additions to `sstable.TableFormat` + +The `sstable.TableFormat` enum is a `uint32` representation of the tuple +`(magic number, format version)`. 
The current values are: + +```go +type TableFormat uint32 + +const ( + TableFormatRocksDBv2 TableFormat = iota + TableFormatLevelDB +) +``` + +It should be noted that this enum is _not_ persisted in the SSTable. It is +purely an internal type that represents the tuple that simplifies a number of +version checks when reading / writing an SSTable. The values are free to +change, provided care is taken with default values and existing usage. + +The existing `sstable.TableFormat` will be altered to reflect the "linear" +nature of the version history. New versions will be added with the next value +in the sequence. + +```go +const ( + TableFormatUnspecified TableFormat = iota + TableFormatLevelDB // The original LevelDB table format. + TableFormatRocksDBv2 // The current default table format. + TableFormatPebblev1 // Block properties. + TableFormatPebblev2 // Range keys. + ... + TableFormatPebbleDBvN +) +``` + +The introduction of `TableFormatUnspecified` can be used to ensure that where a +`sstable.TableFormat` is _not_ specified, Pebble can select a suitable default +for writing the table (most likely based on the format major version in use by +the store; more in the next section). + +## Interaction with the format major version + +The `FormatMajorVersion` type is used to determine the set of features the +store supports. + +A Pebble store may be read-from / written-to by a Pebble binary that supports +newer features, with more recent Pebble format major versions. These newer +features could include the ability to read and write more recent SSTables. +While the store _could_ read and write SSTables at the most recent version the +binary supports, it is not safe to do so, for reasons outlined earlier. + +The format major version will have a "maximum table format version" associated +with it that indicates the maximum `sstable.TableFormat` that can be safely +handled by the store. 
+ +When introducing a new _table format_ version, it should be gated behind an +associated `FormatMajorVersion` that has the new table format as its "maximum +table format version". + +For example: + +```go +// Existing versions. +FormatDefault.MaxTableFormat() // sstable.TableFormatRocksDBv2 +... +FormatSetWithDelete.MaxTableFormat() // sstable.TableFormatRocksDBv2 +// Proposed versions with Pebble version scheme. +FormatBlockPropertyCollector.MaxTableFormat() // sstable.TableFormatPebblev1 +FormatRangeKeys.MaxTableFormat() // sstable.TableFormatPebblev2 +``` + +## Usage in Cockroach + +The introduction of new SSTable format versions needs to be carefully +coordinated between stores to ensure there are no incompatibilities (i.e. newer +store writes an SSTable that cannot be understood by other stores). + +It is only safe to use a new table format when all nodes in a cluster have been +finalized. A newer Cockroach node, with newer Pebble code, should continue to +write SSTables with a table format version equal to or less than the smallest +table format version across all nodes in the cluster. Once the cluster version +has been finalized, and `(*DB).RatchetFormatMajorVersion(FormatMajorVersion)` +has been called, nodes are free to write SSTables at newer table format +versions. + +At runtime, Pebble exposes a `(*DB).FormatMajorVersion()` method, which may be +used to determine the current format major version of the store, and hence, the +associated table format version. + +In addition to the above, there are situations where SSTables are created for +consumption at a later point in time, independent of any Pebble store - +specifically backup and restore. 
Currently, Cockroach uses two functions in +`pkg/storage` to construct SSTables for both ingestion and backup +([here](https://github.com/cockroachdb/cockroach/blob/20eaf0b415f1df361246804e5d1d80c7a20a8eb6/pkg/storage/sst_writer.go#L57) +and +[here](https://github.com/cockroachdb/cockroach/blob/20eaf0b415f1df361246804e5d1d80c7a20a8eb6/pkg/storage/sst_writer.go#L78)). +Both will need to be updated to take into account the cluster version to ensure +that SSTables with newer versions are only written once the cluster version has +been finalized. + +### Cluster version migration sequencing + +Cockroach uses cluster versions as a guarantee that all nodes in a cluster are +running at a particular binary version, with a particular set of features +enabled. The Pebble store is ratcheted as the cluster version passes certain +versions that correspond to new Pebble functionality. Care must be taken to +prevent subtle race conditions while the cluster version is being updated +across all nodes in a cluster. + +Consider a cluster at cluster version `n-1` with corresponding Pebble format +major version `A`. A new cluster version `n` introduces a new Pebble format +major version `B` with new table level features. One by one, nodes will bump +their format major versions from `A` to `B` as they are upgraded to cluster +version `n`. There exists a period of time where nodes in a cluster are split +between cluster versions `n-1` and `n`, and Pebble format major versions `A` +and `B`. If version `B` introduces SSTable level features that nodes with +stores at format major version `A` do not yet understand, there exists the risk +for runtime incompatibilities. + +To guard against the window of incompatibility, _two_ cluster versions are +employed when bumping Pebble format major versions that correspond to new +SSTable level features. The first cluster version is used to synchronize all +stores at the same Pebble format major version (and therefore table format +version). 
The second cluster version is used as a feature gate that enables +Cockroach nodes to make use of the newer table format, relying on the guarantee +that if a node is at version `n + 1`, then all other nodes in the cluster must +all be at least at version `n`, and therefore have Pebble stores at format +major version `B`. diff --git a/pebble/docs/RFCS/20220311_pebble_flushable_ingested_sstable.md b/pebble/docs/RFCS/20220311_pebble_flushable_ingested_sstable.md new file mode 100644 index 0000000..fb6ff7f --- /dev/null +++ b/pebble/docs/RFCS/20220311_pebble_flushable_ingested_sstable.md @@ -0,0 +1,280 @@ +- Feature Name: Flushable Ingested SSTable +- Status: in-progress +- Start Date: 2022-03-11 +- Authors: Mufeez Amjad +- RFC PR: [#1586](https://github.com/cockroachdb/pebble/pull/1586) +- Pebble Issues: [#25](https://github.com/cockroachdb/pebble/issues/25) +- Cockroach Issues: + +## Summary + +To avoid a forced flush when ingesting SSTables that have an overlap with a +memtable, we "lazily" add the SSTs to the LSM as a `*flushableEntry` to +`d.mu.mem.queue`. In comparison to a regular ingest which adds the SSTs to the +lowest possible level, the SSTs will get placed in the memtable queue before +they are eventually flushed (to the lowest level possible). This state is only +persisted in memory until a flush occurs, thus we require a WAL entry to replay +the ingestion in the event of a crash. + +## Motivation + +Currently, if any of the SSTs that need to be ingested have an overlap with a +memtable, we +[wait](https://github.com/cockroachdb/pebble/blob/56c5aebe151977964db7e464bb6c87ebd3451bd5/ingest.go#L671) +for the memtable to be flushed before the ingestion can proceed. This is to +satisfy the invariant that newer entries (those in the ingested SSTs) in the LSM +have a higher sequence number than old entries (those in the memtables). 
This +problem is also present for subsequent normal writes that are blocked behind the +ingest waiting for their sequence number to be made visible. + +## Technical Design + +The proposed design is mostly taken from Peter's suggestion in #25. The core +requirements are: +1. Replayable WAL entry for the ingest. +2. Implementation of the `flushable` interface for a new `ingestedSSTables` struct. +3. Lazily adding the ingested SSTs to the LSM. +4. Flushing logic to move SSTs into L0-L6. + +
+ +### 1. WAL Entry + +We require a WAL entry to make the ingestion into the flushable queue +replayable, and there is a need for a new type of WAL entry that does not get +applied to the memtable. 2 approaches were considered: +1. Using `seqnum=0` to differentiate this new WAL entry. +2. Introduce a new `InternalKeyKind` for the new WAL entry, + `InternalKeyKindIngestSST`. + +We believe the second approach is better because it avoids modifying batch +headers which can be messy/hacky and because `seqnum=0` is already used for +unapplied batches. The second approach also gives way for a simpler/cleaner +implementation because it utilizes the extensibility of `InternalKeyKind` and is +similar to the treatment of `InternalKeyKindLogData`. It also follows the +correct seqnum semantics for SSTable ingestion in the event of a WAL replay — +each SST in the ingestion batch already gets its own sequence number. + +This change will need to be gated on a `FormatMajorVersion` because if the store +is opened with an older version of Pebble, Pebble will not understand any WAL +entry that contains the new `InternalKeyKind`. + +
+ +When performing an ingest (with overlap), we create a batch with the header: + +``` ++-------------+------------+--- ... ---+ +| SeqNum (8B) | Count (4B) | Entries | ++-------------+------------+--- ... ---+ +``` + +where`SeqNum` is the current running sequence number in the WAL, `Count` is the +number of ingested SSTs, and each entry has the form: + +``` ++-----------+-----------------+-------------------+ +| Kind (1B) | Key (varstring) | Value (varstring) | ++-----------+-----------------+-------------------+ +``` + +where `Kind` is `InternalKeyKindIngestSST`, and `Key` is a path to the +ingested SST on disk. + +When replaying the WAL, we check every batch's first entry and if `keykind == +InternalKeyKindIngestSSTs` then we continue reading the rest of the entries in +the batch of SSTs and replay the ingestion steps - we construct a +`flushableEntry` and add it to the flushable queue: + +```go +b = Batch{db: d} +b.SetRepr(buf.Bytes()) +seqNum := b.SeqNum() +maxSeqNum = seqNum + uint64(b.Count()) +br := b.Reader() +if kind, _, _, _ := br.Next(); kind == InternalKeyKindIngestSST { + // Continue reading the rest of the batch and construct flushable + // of sstables with correct seqnum and add to queue. + buf.Reset() + continue +} +``` + + +### 2. `flushable` Implementation + +Introduce a new flushable type: `ingestedSSTables`. + +```go +type ingestedSSTables struct { + files []*fileMetadata + size uint64 + + cmp Compare + newIters tableNewIters +} +``` +which implements the following functions from the `flushable` interface: + +#### 1. `newIter(o *IterOptions) internalIterator` + +We return a `levelIter` since the ingested SSTables have no overlap, and we can +treat them like a level in the LSM. + +```go +levelSlice := manifest.NewLevelSliceKeySorted(s.cmp, s.files) +return newLevelIter(*o, s.cmp, nil, s.newIters, levelSlice.Iter(), 0, nil) +``` + +
+ +On the client-side, this iterator would have to be used like this: +```go +var iter internalIteratorWithStats +var rangeDelIter keyspan.FragmentIterator +iter = base.WrapIterWithStats(mem.newIter(&dbi.opts)) +switch mem.flushable.(type) { +case *ingestedSSTables: + iter.(*levelIter).initRangeDel(&rangeDelIter) +default: + rangeDelIter = mem.newRangeDelIter(&dbi.opts) +} + +mlevels = append(mlevels, mergingIterLevel{ + iter: iter, + rangeDelIter: rangeDelIter, +}) +``` + +#### 2. `newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator` + +#### 3. `newRangeDelIter(o *IterOptions) keyspan.FragmentIterator` + +The above two methods would return `nil`. By doing so, in `c.newInputIter()`: +```go +if flushIter := f.newFlushIter(nil, &c.bytesIterated); flushIter != nil { + iters = append(iters, flushIter) +} +if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { + iters = append(iters, rangeDelIter) +} +``` +we ensure that no iterators on `ingestedSSTables` will be used while flushing in +`c.runCompaction()`. + +The special-cased flush process for this flushable is described in [Section +4](#4-flushing-logic-to-move-ssts-into-l0). + +#### 4. `newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator` + +Will wait on range key support in `levelIter` to land before implementing. + +#### 5. `inuseBytes() uint64` and `totalBytes() uint64` + +For both functions, we return 0. + +Returning 0 for `inuseBytes()` means that the calculation of `c.maxOverlapBytes` +is not affected by the SSTs (the ingested SSTs don't participate in the +compaction). + +We don't want the size of the ingested SSTs to contribute to the size of the +memtable when determining whether or not to stall writes +(`MemTableStopWritesThreshold`); they should contribute to the L0 read-amp +instead (`L0StopWritesThreshold`). Thus, we'll have to special case for ingested +SSTs in `d.makeRoomForWrite()` to address this detail. 
+ +`totalBytes()` represents the number of bytes allocated by the flushable, which +in our case is 0. A consequence for this is that the size of the SSTs do not +count towards the flush threshold calculation. However, by setting +`flushableEntry.flushForced` we can achieve the same behaviour. + +#### 6. `readyForFlush() bool` + +The flushable of ingested SSTs can always be flushed because the files are +already on disk, so we return true. + +### 3. Lazily adding the ingested SSTs to the LSM + +The steps to add the ingested SSTs to the flushable queue are: +1. Detect an overlap exists (existing logic). + +Add a check that falls back to the old ingestion logic of blocking the ingest on +the flush when `len(d.mu.mem.queue) >= MemtablesStopWritesThreshold - 1`. This +reduces the chance that many short, overlapping, and successive ingestions cause +a memtable write stall. + +Additionally, to mitigate the hiccup on subsequent normal writes, we could wait +before the call to `d.commit.AllocateSeqNum` until: +1. the number of immutable memtables and `ingestedSSTs` in the flushable queue + is below a certain threshold (to prevent building up too many sublevels) +2. the number of immutable memtables is low. This could lead to starvation if + there is a high rate of normal writes. + +2. Create a batch with the list of ingested SSTs. +```go +b := newBatch() +for _, path := range paths: + b.IngestSSTs([]byte(path), nil) +``` +3. Apply the batch. + +In the call to `d.commit.AllocateSeqNum`, `b.count` sequence numbers are already +allocated before the `prepare` step. When we identify a memtable overlap, we +commit the batch to the WAL manually (through logic similar to +`commitPipeline.prepare`). The `apply` step would be a no-op if we performed a +WAL write in the `prepare` step. We would also need to truncate the memtable/WAL +after this step. + +5. Create `ingestedSSTables` flushable and `flushableEntry`. 
+ +We'd need to call `ingestUpdateSeqNum` on these SSTs before adding them to the +flushable. This is to respect the sequence number ordering invariant while the +SSTs reside in the flushable queue. + +6. Add to flushable queue. + +Pebble requires that the last entry in `d.mu.mem.queue` is the mutable memtable +with value `d.mu.mem.mutable`. When adding a `flushableEntry` to the queue, we +want to maintain this invariant. To do this we pass `nil` as the batch to +`d.makeRoomForWrite()`. The result is + +``` +| immutable old memtable | mutable new memtable | +``` + +We then append our new `flushableEntry`, and swap the last two elements in +`d.mu.mem.queue`: + +``` +| immutable old memtable | ingestedSSTables | mutable new memtable | +``` + +Because we add the ingested SSTs to the flushable queue when there is overlap, +and are skipping applying the version edit through the `apply` step of the +ingestion, we ensure that the SSTs are only added to the LSM once. + +7. Call `d.maybeScheduleFlush()`. + +Because we've added an immutable memtable to the flushable queue and set +`flushForced` on the `flushableEntry`, this will surely result in a flush. This +call can be done asynchronously. + +We can then return to caller without waiting for the flush to finish. + +### 4. Flushing logic to move SSTs into L0-L6 + +By returning `nil` for both `flushable.newFlushIter()` and +`flushable.newRangeDelIter()`, the `ingestedSSTables` flushable will not be +flushed normally. + +The suggestion in issue #25 is to move the SSTs from the flushable queue into +L0. However, only the tables that overlap with the memtable will need to target +L0 (because they will likely overlap with L0 post flush), the others can be +moved to lower levels in the LSM. We can use the existing logic in +`ingestTargetLevel` to determine which level to move the ingested SSTables to +during `c.runCompaction()`. 
However, it's important to do this step after the +memtable has been flushed to use the correct `version` when determining overlap. + +The flushable of ingested SSTs should not influence the bounds on the +compaction, so we will have to skip updating `c.smallest` and `c.largest` in +`d.newFlush()` for this flushable. diff --git a/pebble/docs/RFCS/20221122_virtual_sstable.md b/pebble/docs/RFCS/20221122_virtual_sstable.md new file mode 100644 index 0000000..168dbf6 --- /dev/null +++ b/pebble/docs/RFCS/20221122_virtual_sstable.md @@ -0,0 +1,366 @@ +- Feature Name: Virtual sstables +- Status: in-progress +- Start Date: 2022-10-27 +- Authors: Arjun Nair +- RFC PR: https://github.com/cockroachdb/pebble/pull/2116 +- Pebble Issues: + https://github.com/cockroachdb/pebble/issues/1683 + + +** Design Draft** + +# Summary + +The RFC outlines the design to enable virtualizing of physical sstables +in Pebble. + +A virtual sstable has no associated physical data on disk, and is instead backed +by an existing physical sstable. Each physical sstable may be shared by one, or +more than one virtual sstable. + +Initially, the design will be used to lower the read-amp and the write-amp +caused by certain ingestions. Sometimes, ingestions are unable to place incoming +files, which have no data overlap with other files in the lsm, lower in the lsm +because of file boundary overlap with files in the lsm. In this case, we are +forced to place files higher in the lsm, sometimes in L0, which can cause higher +read-amp and unnecessary write-amp as the file is moved lower down the lsm. See +https://github.com/cockroachdb/cockroach/issues/80589 for the problem occurring +in practice. + +Eventually, the design will also be used for the disaggregated storage masking +use-case: https://github.com/cockroachdb/cockroach/pull/70419/files. + +This document describes the design of virtual sstables in Pebble with enough +detail to aid the implementation and code review. 
+ +# Design + +### Ingestion + +When an sstable is ingested into Pebble, we try to place it in the lowest level +without any data overlap, or any file boundary overlap. We can make use of +virtual sstables in the cases where we're forced to place the ingested sstable +at a higher level due to file boundary overlap, but no data overlap. + +``` + s2 +ingest: [i-j-------n] + s1 +L6: [e---g-----------------p---r] + a b c d e f g h i j k l m n o p q r s t u v w x y z +``` + +Consider the sstable s1 in L6 and the ingesting sstable s2. It is clear that +the file boundaries of s1 and s2 overlap, but there is no data overlap as shown +in the diagram. Currently, we will be forced to ingest the sstable s2 into a +level higher than L6. With virtual sstables, we can split the existing sstable +s1 into two sstables s3 and s4 as shown in the following diagram. + +``` + s3 s2 s4 +L6: [e---g]-[i-j-------n]-[p---r] + a b c d e f g h i j k l m n o p q r s t u v w x y z +``` + +The sstable s1 will be deleted from the lsm. If s1 was a physical sstable, then +we will keep the file on disk as long as we need to so that it can back the +virtual sstables. + +There are cases where the ingesting sstables have no data overlap with existing +sstables, but we can't make use of virtual sstables. Consider: +``` + s2 +ingest: [f-----i-j-------n] + s1 +L6: [e---g-----------------p---r] + a b c d e f g h i j k l m n o p q r s t u v w x y z +``` +We cannot use virtual sstables in the above scenario for two reasons: +1. We don't have a quick method of detecting no data overlap. +2. We will be forced to split the sstable in L6 into more than two virtual + sstables, but we want to avoid many small virtual sstables in the lsm. + +Note that in Cockroach, the easier-to-solve case happens very regularly when an +sstable spans a range boundary (which pebble has no knowledge of), and we ingest +a snapshot of a range in between the two already-present ranges. 
+ +This case, where the ingested sstable can slide in between two existing +sstables, is more likely to happen. It occurs when we ingest a snapshot of a +range in between two already-present ranges. + +`ingestFindTargetLevel` changes: +- The `ingestFindTargetLevel` function is used to determine the target level + of the file which is being ingested. Currently, this function returns an `int` + which is the target level for the ingesting file. Two additional return + parameters, `[]manifest.NewFileEntry` and `*manifest.DeletedFileEntry`, will be + added to the function. +- If `ingestFindTargetLevel` decides to split an existing sstable into virtual + sstables, then it will return new and deleted entries. Otherwise, it will only + return the target level of the ingesting file. +- Within the `ingestFindTargetLevel` function, the `overlapWithIterator` + function is used to quickly detect data overlap. In the case with file + boundary overlap, but no data overlap, in the lowest possible level, we will + split the existing sstable into virtual sstables and generate the + `NewFileEntry`s and the `DeletedFileEntry`. The `FilemetaData` section + describes how the various fields in the `FilemetaData` will be computed for + the newly created virtual sstables. + +- Note that we will not split physical sstables into virtual sstables in L0 for + the use case described in this RFC. The benefit of doing so would be to reduce + the number of L0 sublevels, but the cost would be additional implementation + complexity (see the `FilemetaData` section). We also want to avoid too many + virtual sstables in the lsm as they can lead to space amp (see `Compaction` + section). However, in the future, for the disaggregated storage masking case, + we would need to support ingestion and use of virtual sstables in L0. + +- Note that we may need an upper bound on the number of times an sstable is + split into smaller virtual sstables. We can further reduce the risk of many + small sstables: + 1. 
For CockroachDB's snapshot ingestion, there is one large sst (up to 512MB) + and many tiny ones. We can choose to apply this splitting logic only for + the large sst. It is ok for the tiny ssts to be ingested into L0. + 2. Split only if the ingested sst is at least half the size of the sst being + split. So if we have a smaller ingested sst, we will pick a higher level to + split at (where the ssts are smaller). The lifetime of virtual ssts at a + higher level is smaller, so there is lower risk of littering the LSM with + long-lived small virtual ssts. + 3. For disaggregated storage implementation, we can avoid masking for tiny + sstables being ingested and instead write a range delete like we currently + do. Precise details on the masking use case are out of the scope of this + RFC. + +`ingestApply` changes: +- The new and deleted file entries returned by the `ingestFindTargetLevel` + function will be added to the version edit in `ingestApply`. +- We will appropriately update the `levelMetrics` based on the new information + returned by `ingestFindTargetLevel`. + + +### `FilemetaData` changes + +Each virtual sstable will have a unique file metadata value associated with it. +The metadata may be borrowed from the backing physical sstable, or it may be +unique to the virtual sstable. + +This RFC lists out the fields in the `FileMetadata` struct with information on +how each field will be populated. + +`Atomic.AllowedSeeks`: Field is used for read triggered compactions, and we can +populate this field for each virtual sstable since virtual sstables can be +picked for compactions. + +`Atomic.statsValid`: We can set this to true (`1`) when the virtual sstable is +created. On virtual sstable creation we will estimate the table stats of the +virtual sstable based on the table stats of the physical sstable. We can also +set this to `0` and let the table stats job asynchronously compute the stats. 
+
+`refs`: This will be turned into a pointer which will be shared by the
+virtual/physical sstables. See the deletion section of the RFC to learn how the
+`refs` count will be used.
+
+`FileNum`: We could give each virtual sstable its own file number or share
+the file number between all the virtual sstables. In the former case, the virtual
+sstables will be distinguished by the file number, and will have an additional
+metadata field to indicate the file number of the parent sstable. In the latter
+case, we can use a few of the most significant bits of the 64 bit file number to
+distinguish the virtual sstables.
+
+The benefit of using a single file number for each virtual sstable is that we
+don't need to use additional space to store the file number of the backing
+physical sstable.
+
+It might make sense to give each virtual sstable its own file number. Virtual
+sstables are picked for compactions, and compactions and compaction picking
+expect a unique file number for each of the files which it is compacting.
+For example, read compactions will use the file number of the file to determine
+if a file picked for compaction has already been compacted, the version edit
+will expect a different file number for each virtual sstable, etc.
+
+There are direct references to the `FilemetaData.FileNum` throughout Pebble. For
+example, the file number is accessed when the `DB.Checkpoint` function is
+called. This function iterates through the files in each level of the lsm,
+constructs the filepath using the file number, and reads the file from disk. In
+such cases, it is important to exclude virtual sstables.
+
+`Size`: We compute this using linear interpolation on the number of blocks in
+the parent sstable and the number of blocks in the newly created virtual sstable. 
+
+`SmallestSeqNum/LargestSeqNum`: These fields depend on the parent sstable,
+but we would need to perform a scan of the physical sstable to compute these
+accurately for the virtual sstable upon creation. Instead, we could convert
+these fields into lower and upper bounds of the sequence numbers in a file.
+
+These fields are used for L0 sublevels, pebble tooling, delete compaction hints,
+and a lot of plumbing. We don't need to worry about the L0 sublevels use case
+because we won't have virtual sstables in L0 for the use case in this RFC. For
+the rest of the use cases we can use a lower bound for the smallest seq number,
+and an upper bound for the largest seq number.
+
+TODO(bananabrick): Add more detail for any delete compaction hint changes if
+necessary.
+
+`Smallest/Largest`: These, along with the smallest/largest ranges for the range
+and point keys, can be computed upon virtual sstable creation. Precisely, these
+can be computed when we try and detect data overlap in the `overlapWithIterator`
+function during ingestion.
+
+`Stats`: `TableStats` will either be computed upon virtual sstable creation
+using linear interpolation on the block counts of the virtual/physical sstables
+or asynchronously using the file bounds of the virtual sstable.
+
+`PhysicalState`: We can add an additional struct with state associated with
+physical ssts which have been virtualized.
+
+```
+type PhysicalState struct {
+ // Total refs across all virtual ssts * versions. That is, if the same virtual
+ // sst is present in multiple versions, it may have multiple refs, if the
+ // btree node is not the same.
+ totalRefs int32
+
+ // Number of virtual ssts in the latest version that refer to this physical
+ // SST. Will be 1 if there is only a physical sst, or there is only 1 virtual
+ // sst referencing this physical sst.
+ // INVARIANT: refsInLatestVersion <= totalRefs
+ // refsInLatestVersion == 0 is a zombie sstable. 
+ refsInLatestVersion int32
+
+ fileSize uint64
+
+ // If sst is not virtualized and in latest version
+ // virtualSizeSumInLatestVersion == fileSize. If
+ // virtualSizeSumInLatestVersion > 0 and
+ // virtualSizeSumInLatestVersion/fileSize is very small, the corresponding
+ // virtual sst(s) should be candidates for compaction. These candidates can be
+ // tracked via btree annotations. Incrementally updated in
+ // BulkVersionEdit.Apply, when updating refsInLatestVersion.
+ virtualSizeSumInLatestVersion uint64
+}
+```
+
+The `Deletion` section and the `Compactions` section describe why we need to
+store the `PhysicalState`.
+
+### Deletion of physical and virtual sstables
+
+We want to ensure that the physical sstable is only deleted from disk when no
+version references it, and when there are no virtual sstables which are backed
+by the physical sstable.
+
+Since `FilemetaData.refs` is a pointer which is shared by the physical and
+virtual sstables, the physical sstable won't be deleted when it is removed
+from the latest version as the `FilemetaData.refs` will have been increased
+when the virtual sstable is added to a version. Therefore, we only need to
+ensure that the physical sstable is eventually deleted when there are no
+versions which reference it.
+
+Sstables are deleted from disk by the `DB.doDeleteObsoleteFiles` function which
+looks for files to delete in the `DB.mu.versions.obsoleteTables` slice.
+So we need to ensure that any physical sstable which was virtualized is added to
+the obsolete tables list iff `FilemetaData.refs` is 0.
+
+Sstables are added to the obsolete file list when a `Version` is unrefed and
+when `DB.scanObsoleteFiles` is called when Pebble is opened.
+
+When a `Version` is unrefed, sstables referenced by it are only added to the
+obsolete table list if the `FilemetaData.refs` hits 0 for the sstable. 
With +virtual sstables, we can have a case where the last version which directly +references a physical sstable is unrefed, but the physical sstable is not added +to the obsolete table list because its `FilemetaData.refs` count is not 0 +because of indirect references through virtual sstables. Since the last Version +which directly references the physical sstable is deleted, the physical sstable +will never get added to the obsolete table list. Since virtual sstables keep +track of their parent physical sstable, we can just add the physical sstable to +the obsolete table list when the last virtual sstable which references it is +deleted. + +`DB.scanObsoleteFiles` will delete any file which isn't referenced by the +`VersionSet.versions` list. So, it's possible that a physical sstable associated +with a virtual sstable will be deleted. This problem can be fixed by a small +tweak in the `d.mu.versions.addLiveFileNums` to treat the parent sstable of +a virtual sstable as a live file. + +Deleted files still referenced by older versions are considered zombie sstables. +We can extend the definition of zombie sstables to be any sstable which is not +directly, or indirectly through virtual sstables, referenced by the latest +version. See the `PhysicalState` subsection of the `FilemetaData` section +where we describe how the references in the latest version will be tracked. + + +### Reading from virtual sstables + +Since virtual sstables do not exist on disk, we will have to redirect reads +to the physical sstable which backs the virtual sstable. + +All reads to the physical files go through the table cache which opens the file +on disk and creates a `Reader` for the reads. The table cache currently creates +a `FileNum` -> `Reader` mapping for the physical sstables. + +Most of the functions in table cache API take the file metadata of the file as +a parameter. Examples include `newIters`, `newRangeKeyIter`, `withReader`, etc. 
+Each of these functions then calls a subsequent function on the sstable
+`Reader`.
+
+In the `Reader` API, some functions only really need to be called on physical
+sstables, whereas some functions need to be called on both physical and virtual
+sstables. For example, the `Reader.EstimateDiskUsage` function, or the
+`Reader.Layout` function only need to be called on physical sstables, whereas
+some functions like `Reader.NewIter`, and `Reader.NewCompactionIter` need to
+work with virtual sstables.
+
+We could either have an abstraction over the physical sstable `Reader` per
+virtual sstable, or update the `Reader` API to accept file bounds of the
+sstable. In the latter case, we would create one `Reader` on the physical
+sstable for all of the virtual sstables, and update the `Reader` API to accept
+the file bounds of the sstable.
+
+Changes required to share a `Reader` on the physical sstable among the virtual
+sstables:
+- If the file metadata of the virtual sstable is passed into the table cache, on
+ a table cache miss, the table cache will load the Reader for the physical
+ sstable. This step can be performed in the `tableCacheValue.load` function. On
+ a table cache hit, the file number of the parent sstable will be used to fetch
+ the appropriate sstable `Reader`.
+- The `Reader` API will be updated to support reads from virtual sstables. For
+ example, the `NewCompactionIter` function will take additional
+ `lower,upper []byte` parameters.
+
+Updates to iterators:
+- `Reader.NewIter` already has `lower,upper []byte` parameters so this requires
+ no change.
+- Add `lower,upper` fields to the `Reader.NewCompactionIter`. The function
+ initializes single level and two level iterators, and we can pass in the
+ `lower,upper` values to those. TODO(bananabrick): Make sure that the value
+ of `bytesIterated` in the compaction iterator is still accurate.
+- `Reader.NewRawRangeKeyIter/NewRawRangeDelIter`: We need to add `lower/upper`
+ fields to the functions. 
Both iterators make use of a `fragmentBlockIter`. We
+ could filter keys above the `fragmentBlockIter` or add filtering within the
+ `fragmentBlockIter`. To add filtering within the `fragmentBlockIter` we will
+ initialize it with two additional `lower/upper []byte` fields.
+- We would need to update the `SetBounds` logic for the sstable iterators to
+ never set bounds for the iterators outside the virtual sstable bounds. This
+ could lead to keys outside the virtual sstable bounds, but inside the physical
+ sstable bounds, being surfaced.
+
+TODO(bananabrick): Add a section about sstable properties, if necessary.
+
+### Compactions
+
+Virtual sstables can be picked for compactions. If the `FilemetaData` and the
+iterator stack changes work, then compaction shouldn't require much, if any,
+additional work.
+
+Virtual sstables which are picked for compactions may cause space amplification.
+For example, if we have two virtual sstables `a` and `b` in L5, backed by a
+physical sstable `c`, and the sstable `a` is picked for a compaction, we will
+write some additional data into L6, but we won't delete sstable `c` because
+sstable `b` still refers to it. In the worst case, sstable `b` will never be
+picked for compaction and will never be compacted into and we'll have permanent
+space amplification. We should try to prioritize compaction of sstable `b` to
+prevent such a scenario.
+
+See the `PhysicalState` subsection in the `FilemetaData` section to see how
+we'll store compaction picking metrics to reduce virtual sstable space-amp.
+
+### `VersionEdit` decode/encode
+Any additional fields added to the `FilemetaData` need to be supported in the
+version edit `decode/encode` functions. 
diff --git a/pebble/docs/css/app.css b/pebble/docs/css/app.css new file mode 100644 index 0000000..71e1527 --- /dev/null +++ b/pebble/docs/css/app.css @@ -0,0 +1,117 @@ +body { + margin: 10; + background-color: #fff; + font: 10pt -apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,Apple Color Emoji,Segoe UI Emoji; +} + +.divider { + border-top: 1px solid #eee; +} + +.columns { + display: flex; + flex-direction: row; + align-items: baseline; + justify-content: space-between; +} + +.rows { + display: flex; + flex-direction: column; + flex-wrap: wrap; +} + +.center { + margin: auto; + width: 90%; + min-width: 400px; + max-width: 1200px; +} + +.section { + flex: 100%; + margin-top: 10px; + overflow: auto; +} + +.title { + font-size: 24pt; + font-weight: bold; +} + +.subtitle { + font-size: 12pt; + font-weight: bold; +} + +.updated { + font-size: 9pt; + text-align: right; +} + +div.annotation { + display: none; +} + +.code { + font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,monospace; +} + +.overview { + max-width: 800px; +} + +.controls { + margin: 5px; +} + +a { + text-decoration: none; +} + +.selected { + font-weight: bold; + text-decoration: underline; +} + +path.line1 { + fill: none; + stroke-width: 1.5px; +} + +path.line2 { + fill: none; + stroke-width: 1.5px; +} + +svg.chart { + flex: 50%; + height: 200px; +} + +.write-throughput { + flex: 50%; + height: 300px; +} + +.write-throughput-detail { + flex: 50%; + height: 300px; +} + +text.hover { + font-size: 8pt; + font-weight: bold; + filter: url(#textBackground); +} + +@media only screen and (max-width: 900px) { + .columns { + flex-direction: column; + } + + svg.chart { + width: 100%; + flex: auto; + } +} diff --git a/pebble/docs/index.html b/pebble/docs/index.html new file mode 100644 index 0000000..cc70942 --- /dev/null +++ b/pebble/docs/index.html @@ -0,0 +1,166 @@ + + + + + + + Pebble Benchmarks + + +
+
+
+
Pebble Benchmarks
+
Last updated
+
+
+
+
+ Benchmarks are run nightly using pebble + bench ycsb on AWS m6id.4xlarge machines equipped with + local SSD storage. The AWS instances show remarkably high + instance to instance performance variability. In order to + smooth out that variability the benchmarks are run multiple + times each (using different instances) and outliers are + excluded. +
+
+
+ Detail: + Bytes Read | + Bytes Written | + Read Amp | + Write Amp +
+
+ Options: + Local scale +
+
+
+
+
L0-sublevels and + flush-splits enabled
+
Increased + LogWriter free blocks 4->16
+
Began tracking + ycsb/E read-amp
+
Level metadata + switched to use a B-Tree
+
Enabled + read-triggered compactions
+
Readahead + and preallocation bug fixed
+
Removed excess + read samples for read-triggered compactions
+
Switched to Ubuntu + 20.04.2 LTS AMI
+
Read compaction fixes
+
Bumped benchmark + runtime to 90 minutes
+
Data quality issue introduced (YCSB A only)
+
Data quality issue fixed (YCSB A only)
+
Began zeroing reused + iterator structs (#1822)
+
Grandparent boundary + compaction splitting
+
Infrastructure + change (#2578)
+
ycsb/F sampling bug
+
Switched to m6id.4xlarge + (from 5d.4xlarge)
+
+
+
+ YCSB A + (50% reads, 50% updates, zipf key distribution) +
+
+ + +
+
+
+
+ YCSB B + (95 reads, 5% updates, zipf key distribution) +
+
+ + +
+
+
+
+ YCSB C + (100% reads, zipf key distribution) +
+
+ + +
+
+
+
+ YCSB D + (95% reads, 5% updates, uniform key distribution) +
+
+ + +
+
+
+
+ YCSB E + (95% scans, 5% updates, zipf key distribution) +
+
+ + +
+
+
+
+ Insert-only + (100% inserts, zipf key distribution) +
+
+ + +
+
+
+
+
+
+
+ Write throughput + (100% inserts, zipf key distribution) +
+
+
+ This benchmark attempts to find the optimal write throughput by + driving more and more load against the DB until a target heuristic + fails (currently a mixture of number of L0 sublevels, L0 files, and + whether the DB has experienced a write stall). These benchmarks are + run nightly using pebble + bench write on GCP n2-standard-32 machines equipped with 16 local + SSDs in a RAID 0 array. +
+
+
+
+ + +
+
+
+ + + + + + diff --git a/pebble/docs/io_profiling.md b/pebble/docs/io_profiling.md new file mode 100644 index 0000000..8fd6230 --- /dev/null +++ b/pebble/docs/io_profiling.md @@ -0,0 +1,231 @@ +# I/O Profiling + +Linux provide extensive kernel profiling capabilities, including the +ability to trace operations at the block I/O layer. These tools are +incredibly powerful, though sometimes overwhelming in their +flexibility. This document captures some common recipes for profiling +Linux I/O. + +* [Perf](#perf) +* [Blktrace](#blktrace) + +## Perf + +The Linux `perf` command can instrument CPU performance counters, and +the extensive set of kernel trace points. A great place to get started +understanding `perf` are Brendan Gregg's [perf +examples](http://www.brendangregg.com/perf.html). + +The two modes of operation are "live" reporting via `perf top`, and +record and report via `perf record` and `perf +{report,script}`. + +Recording the stack traces for `block:block_rq_insert` event allows +determination of what Pebble level code is generating block requests. + +### Installation + +Ubuntu AWS installation: + +``` +sudo apt-get install linux-tools-common linux-tools-4.4.0-1049-aws linux-cloud-tools-4.4.0-1049-aws +``` + +### Recording + +`perf record` (and `perf top`) requires read and write access to +`/sys/kernel/debug/tracing`. Running as root as an easiest way to get +the right permissions. + +``` +# Trace all block device (disk I/O) requests with stack traces, until Ctrl-C. +sudo perf record -e block:block_rq_insert -ag + +# Trace all block device (disk I/O) issues and completions with stack traces, until Ctrl-C. +sudo perf record -e block:block_rq_issue -e block:block_rq_complete -ag +``` + +The `-a` flag records events on all CPUs (almost always desirable). + +The `-g` flag records call graphs (a.k.a stack traces). Capturing the +stack trace makes the recording somewhat more expensive, but it +enables determining the originator of the event. 
Note the stack traces +include both the kernel and application code, allowing pinpointing the +source of I/O as due to flush, compaction, WAL writes, etc. + +The `-e` flag controls which events are instrumented. The list of +`perf` events is enormous. See `sudo perf list`. + +The `-o` flag controls where output is recorded. The default is +`perf.data`. + +In order to record events for a specific duration, you can append `-- +sleep ` to the command line. + +``` +# Trace all block device (disk I/O) requests with stack traces for 10s. +sudo perf record -e block:block_rq_insert -ag -- sleep 10 +``` + +### Reporting + +The recorded perf data (`perf.data`) can be explored using `perf +report` and `perf script`. + +``` +# Show perf.data in an ncurses browser. +sudo perf report + +# Show perf.data as a text report. +sudo perf report --stdio +``` + +As an example, `perf report --stdio` from perf data gathered using +`perf record -e block:block_rq_insert -ag` will show something like: + +``` + 96.76% 0.00% pebble pebble [.] 
runtime.goexit + | + ---runtime.goexit + | + |--85.58%-- github.com/cockroachdb/pebble/internal/record.NewLogWriter.func2 + | runtime/pprof.Do + | github.com/cockroachdb/pebble/internal/record.(*LogWriter).flushLoop-fm + | github.com/cockroachdb/pebble/internal/record.(*LogWriter).flushLoop + | github.com/cockroachdb/pebble/internal/record.(*LogWriter).flushPending + | github.com/cockroachdb/pebble/vfs.(*syncingFile).Sync + | github.com/cockroachdb/pebble/vfs.(*syncingFile).syncFdatasync-fm + | github.com/cockroachdb/pebble/vfs.(*syncingFile).syncFdatasync + | syscall.Syscall + | entry_SYSCALL_64_fastpath + | sys_fdatasync + | do_fsync + | vfs_fsync_range + | ext4_sync_file + | filemap_write_and_wait_range + | __filemap_fdatawrite_range + | do_writepages + | ext4_writepages + | blk_finish_plug + | blk_flush_plug_list + | blk_mq_flush_plug_list + | blk_mq_insert_requests +``` + +This is showing that `96.76%` of block device requests on the entire +system were generated by the `pebble` process, and `85.58%` of the +block device requests on the entire system were generated from WAL +syncing within this `pebble` process. + +The `perf script` command provides access to the raw request +data. While there are various pre-recorded scripts that can be +executed, it is primarily useful for seeing call stacks along with the +"trace" data. For block requests, the trace data shows the device, the +operation type, the offset, and the size. + +``` +# List all events from perf.data with recommended header and fields. +sudo perf script --header -F comm,pid,tid,cpu,time,event,ip,sym,dso,trace +... 
+pebble 6019/6019 [008] 16492.555957: block:block_rq_insert: 259,0 WS 0 () 3970952 + 256 [pebble] + 7fff813d791a blk_mq_insert_requests + 7fff813d8878 blk_mq_flush_plug_list + 7fff813ccc96 blk_flush_plug_list + 7fff813cd20c blk_finish_plug + 7fff812a143d ext4_writepages + 7fff8119ea1e do_writepages + 7fff81191746 __filemap_fdatawrite_range + 7fff8119188a filemap_write_and_wait_range + 7fff81297c41 ext4_sync_file + 7fff81244ecb vfs_fsync_range + 7fff81244f8d do_fsync + 7fff81245243 sys_fdatasync + 7fff8181ae6d entry_SYSCALL_64_fastpath + 3145e0 syscall.Syscall + 6eddf3 github.com/cockroachdb/pebble/vfs.(*syncingFile).syncFdatasync + 6f069a github.com/cockroachdb/pebble/vfs.(*syncingFile).syncFdatasync-fm + 6ed8d2 github.com/cockroachdb/pebble/vfs.(*syncingFile).Sync + 72542f github.com/cockroachdb/pebble/internal/record.(*LogWriter).flushPending + 724f5c github.com/cockroachdb/pebble/internal/record.(*LogWriter).flushLoop + 72855e github.com/cockroachdb/pebble/internal/record.(*LogWriter).flushLoop-fm + 7231d8 runtime/pprof.Do + 727b09 github.com/cockroachdb/pebble/internal/record.NewLogWriter.func2 + 2c0281 runtime.goexit +``` + +Let's break down the trace data: + +``` +259,0 WS 0 () 3970952 + 256 + | | | | + | | | + size (sectors) + | | | + | | + offset (sectors) + | | + | +- flags: R(ead), W(rite), B(arrier), S(ync), D(iscard), N(one) + | + +- device: , +``` + +The above is indicating that a synchronous write of `256` sectors was +performed starting at sector `3970952`. The sector size is device +dependent and can be determined with `blockdev --report `, +though it is almost always `512` bytes. In this case, the sector size +is `512` bytes indicating that this is a write of 128 KB. + +## Blktrace + +The `blktrace` tool records similar info to `perf`, but is targeted to +the block layer instead of being general purpose. The `blktrace` +command records data, while the `blkparse` command parses and displays +data. 
The `btrace` command is a shortcut for piping the output from +`blktrace` directly into `blkparse. + +### Installation + +Ubuntu AWS installation: + +``` +sudo apt-get install blktrace +``` + +## Usage + +``` +# Pipe the output of blktrace directly into blkparse. +sudo blktrace -d /dev/nvme1n1 -o - | blkparse -i - + +# Equivalently. +sudo btrace /dev/nvme1n1 +``` + +The information captured by `blktrace` is similar to what `perf` captures: + +``` +sudo btrace /dev/nvme1n1 +... +259,0 4 186 0.016411295 11538 Q WS 129341760 + 296 [pebble] +259,0 4 187 0.016412100 11538 Q WS 129342016 + 40 [pebble] +259,0 4 188 0.016412200 11538 G WS 129341760 + 256 [pebble] +259,0 4 189 0.016412714 11538 G WS 129342016 + 40 [pebble] +259,0 4 190 0.016413148 11538 U N [pebble] 2 +259,0 4 191 0.016413255 11538 I WS 129341760 + 256 [pebble] +259,0 4 192 0.016413321 11538 I WS 129342016 + 40 [pebble] +259,0 4 193 0.016414271 11538 D WS 129341760 + 256 [pebble] +259,0 4 194 0.016414860 11538 D WS 129342016 + 40 [pebble] +259,0 12 217 0.016687595 0 C WS 129341760 + 256 [0] +259,0 12 218 0.016700021 0 C WS 129342016 + 40 [0] +``` + +The standard format is: + +``` + + [] +``` + +See `man blkparse` for an explanation of the actions. + +The `blktrace` output can be used to highlight problematic I/O +patterns. For example, it can be used to determine there are an +excessive number of small sequential read I/Os indicating that dynamic +readahead is not working correctly. diff --git a/pebble/docs/js/app.js b/pebble/docs/js/app.js new file mode 100644 index 0000000..d944350 --- /dev/null +++ b/pebble/docs/js/app.js @@ -0,0 +1,695 @@ +// TODO(peter) +// - Save pan/zoom settings in query params +// +// TODO(travers): There exists an awkward ordering script loading issue where +// write-throughput.js is loaded first, but contains references to functions +// defined in this file. Work out a better way of modularizing this code. 
+ +const parseTime = d3.timeParse("%Y%m%d"); +const formatTime = d3.timeFormat("%b %d"); +const dateBisector = d3.bisector(d => d.date).left; + +let minDate; +let max = { + date: new Date(), + perChart: {}, + opsSec: 0, + readBytes: 0, + writeBytes: 0, + readAmp: 0, + writeAmp: 0 +}; +let usePerChartMax = false; +let detail; +let detailName; +let detailFormat; + +let annotations = []; + +function getMaxes(chartKey) { + return usePerChartMax ? max.perChart[chartKey] : max; +} + +function styleWidth(e) { + const width = +e.style("width").slice(0, -2); + return Math.round(Number(width)); +} + +function styleHeight(e) { + const height = +e.style("height").slice(0, -2); + return Math.round(Number(height)); +} + +function pathGetY(path, x) { + // Walk along the path using binary search to locate the point + // with the supplied x value. + let start = 0; + let end = path.getTotalLength(); + while (start < end) { + const target = (start + end) / 2; + const pos = path.getPointAtLength(target); + if (Math.abs(pos.x - x) < 0.01) { + // Close enough. + return pos.y; + } else if (pos.x > x) { + end = target; + } else { + start = target; + } + } + return path.getPointAtLength(start).y; +} + +// Pretty formatting of a number in human readable units. +function humanize(s) { + const iecSuffixes = [" B", " KB", " MB", " GB", " TB", " PB", " EB"]; + if (s < 10) { + return "" + s; + } + let e = Math.floor(Math.log(s) / Math.log(1024)); + let suffix = iecSuffixes[Math.floor(e)]; + let val = Math.floor(s / Math.pow(1024, e) * 10 + 0.5) / 10; + return val.toFixed(val < 10 ? 
1 : 0) + suffix; +} + +function dirname(path) { + return path.match(/.*\//)[0]; +} + +function equalDay(d1, d2) { + return ( + d1.getYear() == d2.getYear() && + d1.getMonth() == d2.getMonth() && + d1.getDate() == d2.getDate() + ); +} + +function computeSegments(data) { + return data.reduce(function(segments, d) { + if (segments.length == 0) { + segments.push([d]); + return segments; + } + + const lastSegment = segments[segments.length - 1]; + const lastDatum = lastSegment[lastSegment.length - 1]; + const days = Math.round( + (d.date.getTime() - lastDatum.date.getTime()) / + (24 * 60 * 60 * 1000) + ); + if (days == 1) { + lastSegment.push(d); + } else { + segments.push([d]); + } + return segments; + }, []); +} + +function computeGaps(segments) { + let gaps = []; + for (let i = 1; i < segments.length; ++i) { + const last = segments[i - 1]; + const cur = segments[i]; + gaps.push([last[last.length - 1], cur[0]]); + } + + // If the last day is not equal to the current day, add a gap that + // spans to the current day. 
+ const last = segments[segments.length - 1]; + const lastDay = last[last.length - 1]; + if (!equalDay(lastDay.date, max.date)) { + const maxDay = Object.assign({}, lastDay); + maxDay.date = max.date; + gaps.push([lastDay, maxDay]); + } + return gaps; +} + +function renderChart(chart) { + const chartKey = chart.attr("data-key"); + const vals = data[chartKey]; + + const svg = chart.html(""); + + const margin = { top: 25, right: 60, bottom: 25, left: 60 }; + + const width = styleWidth(svg) - margin.left - margin.right, + height = styleHeight(svg) - margin.top - margin.bottom; + + const defs = svg.append("defs"); + const filter = defs + .append("filter") + .attr("id", "textBackground") + .attr("x", 0) + .attr("y", 0) + .attr("width", 1) + .attr("height", 1); + filter.append("feFlood").attr("flood-color", "white"); + filter.append("feComposite").attr("in", "SourceGraphic"); + + defs + .append("clipPath") + .attr("id", chartKey) + .append("rect") + .attr("x", 0) + .attr("y", -margin.top) + .attr("width", width) + .attr("height", margin.top + height + 10); + + const title = svg + .append("text") + .attr("class", "chart-title") + .attr("x", margin.left + width / 2) + .attr("y", 15) + .style("text-anchor", "middle") + .style("font", "8pt sans-serif") + .text(chartKey); + + const g = svg + .append("g") + .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); + + const x = d3.scaleTime().range([0, width]); + const x2 = d3.scaleTime().range([0, width]); + const y1 = d3.scaleLinear().range([height, 0]); + const z = d3.scaleOrdinal(d3.schemeCategory10); + const xFormat = formatTime; + + x.domain([minDate, max.date]); + x2.domain([minDate, max.date]); + + y1.domain([0, getMaxes(chartKey).opsSec]); + + const xAxis = d3.axisBottom(x).ticks(5); + + g + .append("g") + .attr("class", "axis axis--x") + .attr("transform", "translate(0," + height + ")") + .call(xAxis); + g + .append("g") + .attr("class", "axis axis--y") + .call(d3.axisLeft(y1).ticks(5)); + + if (!vals) 
{ + // That's all we can draw for an empty chart. + svg + .append("text") + .attr("x", margin.left + width / 2) + .attr("y", margin.top + height / 2) + .style("text-anchor", "middle") + .style("font", "8pt sans-serif") + .text("No data"); + return; + } + + const view = g + .append("g") + .attr("class", "view") + .attr("clip-path", "url(#" + chartKey + ")"); + + const triangle = d3 + .symbol() + .type(d3.symbolTriangle) + .size(12); + view + .selectAll("path.annotation") + .data(annotations) + .enter() + .append("path") + .attr("class", "annotation") + .attr("d", triangle) + .attr("stroke", "#2b2") + .attr("fill", "#2b2") + .attr( + "transform", + d => "translate(" + (x(d.date) + "," + (height + 5) + ")") + ); + + view + .selectAll("line.annotation") + .data(annotations) + .enter() + .append("line") + .attr("class", "annotation") + .attr("fill", "none") + .attr("stroke", "#2b2") + .attr("stroke-width", "1px") + .attr("stroke-dasharray", "1 2") + .attr("x1", d => x(d.date)) + .attr("x2", d => x(d.date)) + .attr("y1", 0) + .attr("y2", height); + + // Divide the data into contiguous days so that we can avoid + // interpolating days where there is missing data. 
+ const segments = computeSegments(vals); + const gaps = computeGaps(segments); + + const line1 = d3 + .line() + .x(d => x(d.date)) + .y(d => y1(d.opsSec)); + const path1 = view + .selectAll(".line1") + .data(segments) + .enter() + .append("path") + .attr("class", "line1") + .attr("d", line1) + .style("stroke", d => z(0)); + + view + .selectAll(".line1-gaps") + .data(gaps) + .enter() + .append("path") + .attr("class", "line1-gaps") + .attr("d", line1) + .attr("opacity", 0.8) + .style("stroke", d => z(0)) + .style("stroke-dasharray", "1 2"); + + let y2 = d3.scaleLinear().range([height, 0]); + let line2; + let path2; + if (detail) { + y2 = d3.scaleLinear().range([height, 0]); + y2.domain([0, detail(getMaxes(chartKey))]); + g + .append("g") + .attr("class", "axis axis--y") + .attr("transform", "translate(" + width + ",0)") + .call( + d3 + .axisRight(y2) + .ticks(5) + .tickFormat(detailFormat) + ); + + line2 = d3 + .line() + .x(d => x(d.date)) + .y(d => y2(detail(d))); + path2 = view + .selectAll(".line2") + .data(segments) + .enter() + .append("path") + .attr("class", "line2") + .attr("d", line2) + .style("stroke", d => z(1)); + view + .selectAll(".line2-gaps") + .data(gaps) + .enter() + .append("path") + .attr("class", "line2-gaps") + .attr("d", line2) + .attr("opacity", 0.8) + .style("stroke", d => z(1)) + .style("stroke-dasharray", "1 2"); + } + + const updateZoom = function(t) { + x.domain(t.rescaleX(x2).domain()); + g.select(".axis--x").call(xAxis); + g.selectAll(".line1").attr("d", line1); + g.selectAll(".line1-gaps").attr("d", line1); + if (detail) { + g.selectAll(".line2").attr("d", line2); + g.selectAll(".line2-gaps").attr("d", line2); + } + g + .selectAll("path.annotation") + .attr( + "transform", + d => "translate(" + (x(d.date) + "," + (height + 5) + ")") + ); + g + .selectAll("line.annotation") + .attr("x1", d => x(d.date)) + .attr("x2", d => x(d.date)); + }; + svg.node().updateZoom = updateZoom; + + const hoverSeries = function(mouse) { + if (!detail) { 
+ return 1; + } + const mousex = mouse[0]; + const mousey = mouse[1] - margin.top; + const path1Y = pathGetY(path1.node(), mousex); + const path2Y = pathGetY(path2.node(), mousex); + return Math.abs(mousey - path1Y) < Math.abs(mousey - path2Y) ? 1 : 2; + }; + + // This is a bit funky: initDate() initializes the date range to + // [today-90,today]. We then allow zooming out by 4x which will + // give a maximum range of 360 days. We limit translation to the + // 360 day period. The funkiness is that it would be more natural + // to start at the maximum zoomed amount and then initialize the + // zoom. But that doesn't work because we want to maintain the + // existing zoom settings whenever we have to (re-)render(). + const zoom = d3 + .zoom() + .scaleExtent([0.25, 2]) + .translateExtent([[-width * 3, 0], [width, 1]]) + .extent([[0, 0], [width, 1]]) + .on("zoom", function() { + const t = d3.event.transform; + if (!d3.event.sourceEvent) { + updateZoom(t); + return; + } + + d3.selectAll(".chart").each(function() { + if (this.updateZoom != null) { + this.updateZoom(t); + } + }); + + d3.selectAll(".chart").each(function() { + this.__zoom = t.translate(0, 0); + }); + + const mouse = d3.mouse(this); + if (mouse) { + mouse[0] -= margin.left; // adjust for rect.mouse position + const date = x.invert(mouse[0]); + const hover = hoverSeries(mouse); + d3.selectAll(".chart.ycsb").each(function() { + this.updateMouse(mouse, date, hover); + }); + } + }); + + svg.call(zoom); + svg.call(zoom.transform, d3.zoomTransform(svg.node())); + + const lineHover = g + .append("line") + .attr("class", "hover") + .style("fill", "none") + .style("stroke", "#f99") + .style("stroke-width", "1px"); + + const dateHover = g + .append("text") + .attr("class", "hover") + .attr("fill", "#f22") + .attr("text-anchor", "middle") + .attr("alignment-baseline", "hanging") + .attr("transform", "translate(0, 0)"); + + const opsHover = g + .append("text") + .attr("class", "hover") + .attr("fill", "#f22") + 
.attr("text-anchor", "middle") + .attr("transform", "translate(0, 0)"); + + const marker = g + .append("circle") + .attr("class", "hover") + .attr("r", 3) + .style("opacity", "0") + .style("stroke", "#f22") + .style("fill", "#f22"); + + svg.node().updateMouse = function(mouse, date, hover) { + const mousex = mouse[0]; + const mousey = mouse[1]; + const i = dateBisector(vals, date, 1); + const v = + i == vals.length + ? vals[i - 1] + : mousex - x(vals[i - 1].date) < x(vals[i].date) - mousex + ? vals[i - 1] + : vals[i]; + const noData = mousex < x(vals[0].date); + + let lineY = height; + if (!noData) { + if (hover == 1) { + lineY = pathGetY(path1.node(), mousex); + } else { + lineY = pathGetY(path2.node(), mousex); + } + } + + let val, valY, valFormat; + if (hover == 1) { + val = v.opsSec; + valY = y1(val); + valFormat = d3.format(",.0f"); + } else { + val = detail(v); + valY = y2(val); + valFormat = detailFormat; + } + + lineHover + .attr("x1", mousex) + .attr("x2", mousex) + .attr("y1", lineY) + .attr("y2", height); + marker.attr("transform", "translate(" + x(v.date) + "," + valY + ")"); + dateHover + .attr("transform", "translate(" + mousex + "," + (height + 8) + ")") + .text(xFormat(date)); + opsHover + .attr( + "transform", + "translate(" + x(v.date) + "," + (valY - 7) + ")" + ) + .text(valFormat(val)); + }; + + const rect = svg + .append("rect") + .attr("class", "mouse") + .attr("cursor", "move") + .attr("fill", "none") + .attr("pointer-events", "all") + .attr("width", width) + .attr("height", height + margin.top + margin.bottom) + .attr("transform", "translate(" + margin.left + "," + 0 + ")") + .on("mousemove", function() { + const mouse = d3.mouse(this); + const date = x.invert(mouse[0]); + const hover = hoverSeries(mouse); + + let resetTitle = true; + for (let i in annotations) { + if (Math.abs(mouse[0] - x(annotations[i].date)) <= 5) { + title + .style("font-size", "9pt") + .text(annotations[i].message); + resetTitle = false; + break; + } + } + if 
(resetTitle) { + title.style("font-size", "8pt").text(chartKey); + } + + d3.selectAll(".chart").each(function() { + if (this.updateMouse != null) { + this.updateMouse(mouse, date, hover); + } + }); + }) + .on("mouseover", function() { + d3 + .selectAll(".chart") + .selectAll(".hover") + .style("opacity", 1.0); + }) + .on("mouseout", function() { + d3 + .selectAll(".chart") + .selectAll(".hover") + .style("opacity", 0); + }); +} + +function renderYCSB() { + d3.selectAll(".chart.ycsb").each(function(d, i) { + renderChart(d3.select(this)); + }); +} + +function initData() { + for (key in data) { + data[key] = d3.csvParseRows(data[key], function(d, i) { + return { + date: parseTime(d[0]), + opsSec: +d[1], + readBytes: +d[2], + writeBytes: +d[3], + readAmp: +d[4], + writeAmp: +d[5] + }; + }); + + const vals = data[key]; + max.perChart[key] = { + opsSec: d3.max(vals, d => d.opsSec), + readBytes: d3.max(vals, d => d.readBytes), + writeBytes: d3.max(vals, d => d.writeBytes), + readAmp: d3.max(vals, d => d.readAmp), + writeAmp: d3.max(vals, d => d.writeAmp), + } + max.opsSec = Math.max(max.opsSec, max.perChart[key].opsSec); + max.readBytes = Math.max(max.readBytes, max.perChart[key].readBytes); + max.writeBytes = Math.max( + max.writeBytes, + max.perChart[key].writeBytes, + ); + max.readAmp = Math.max(max.readAmp, max.perChart[key].readAmp); + max.writeAmp = Math.max(max.writeAmp, max.perChart[key].writeAmp); + } + + // Load the write-throughput data and merge with the existing data. We + // return a promise here to allow us to continue to make progress elsewhere. 
+ return fetch(writeThroughputSummaryURL()) + .then(response => response.json()) + .then(wtData => { + for (let key in wtData) { + data[key] = wtData[key]; + } + }); +} + +function initDateRange() { + max.date.setHours(0, 0, 0, 0); + minDate = new Date(new Date().setDate(max.date.getDate() - 90)); +} + +function initAnnotations() { + d3.selectAll(".annotation").each(function() { + const annotation = d3.select(this); + const date = parseTime(annotation.attr("data-date")); + annotations.push({ date: date, message: annotation.text() }); + }); +} + +function setQueryParams() { + var params = new URLSearchParams(); + if (detailName) { + params.set("detail", detailName); + } + if (usePerChartMax) { + params.set("max", "local"); + } + var search = "?" + params; + if (window.location.search != search) { + window.history.pushState(null, null, search); + } +} + +function setDetail(name) { + detail = undefined; + detailFormat = undefined; + detailName = name; + + switch (detailName) { + case "readBytes": + detail = d => d.readBytes; + detailFormat = humanize; + break; + case "writeBytes": + detail = d => d.writeBytes; + detailFormat = humanize; + break; + case "readAmp": + detail = d => d.readAmp; + detailFormat = d3.format(",.1f"); + break; + case "writeAmp": + detail = d => d.writeAmp; + detailFormat = d3.format(",.1f"); + break; + } + + d3.selectAll(".toggle").classed("selected", false); + d3.select("#" + detailName).classed("selected", detail != null); +} + +function initQueryParams() { + var params = new URLSearchParams(window.location.search.substring(1)); + setDetail(params.get("detail")); + usePerChartMax = params.get("max") == "local"; + d3.select("#localMax").classed("selected", usePerChartMax); +} + +function toggleDetail(name) { + const link = d3.select("#" + name); + const selected = !link.classed("selected"); + link.classed("selected", selected); + if (selected) { + setDetail(name); + } else { + setDetail(null); + } + setQueryParams(); + renderYCSB(); +} + 
+function toggleLocalMax() { + const link = d3.select("#localMax"); + const selected = !link.classed("selected"); + link.classed("selected", selected); + usePerChartMax = selected; + setQueryParams(); + renderYCSB(); +} + +window.onload = function init() { + d3.selectAll(".toggle").each(function() { + const link = d3.select(this); + link.attr("href", 'javascript:toggleDetail("' + link.attr("id") + '")'); + }); + d3.selectAll("#localMax").each(function() { + const link = d3.select(this); + link.attr("href", 'javascript:toggleLocalMax()'); + }); + + initData().then(_ => { + initDateRange(); + initAnnotations(); + initQueryParams(); + + renderYCSB(); + renderWriteThroughputSummary(data); + + // Use the max date to bisect into the workload data to pluck out the + // correct datapoint. + let workloadData = data[writeThroughputWorkload]; + bisectAndRenderWriteThroughputDetail(workloadData, max.date); + + let lastUpdate; + for (let key in data) { + const max = d3.max(data[key], d => d.date); + if (!lastUpdate || lastUpdate < max) { + lastUpdate = max; + } + } + d3.selectAll(".updated") + .text("Last updated: " + d3.timeFormat("%b %e, %Y")(lastUpdate)); + }) + + // By default, display each panel with its local max, which makes spotting + // regressions simpler. + toggleLocalMax(); +}; + +window.onpopstate = function() { + initQueryParams(); + renderYCSB(); +}; + +window.addEventListener("resize", renderYCSB); diff --git a/pebble/docs/js/d3.v5.min.js b/pebble/docs/js/d3.v5.min.js new file mode 100644 index 0000000..a75674c --- /dev/null +++ b/pebble/docs/js/d3.v5.min.js @@ -0,0 +1,2 @@ +// https://d3js.org Version 5.1.0. Copyright 2018 Mike Bostock. 
+(function(t,n){"object"==typeof exports&&"undefined"!=typeof module?n(exports):"function"==typeof define&&define.amd?define(["exports"],n):n(t.d3=t.d3||{})})(this,function(t){"use strict";function n(t,n){return tn?1:t>=n?0:NaN}function e(t){return 1===t.length&&(t=function(t){return function(e,r){return n(t(e),r)}}(t)),{left:function(n,e,r,i){for(null==r&&(r=0),null==i&&(i=n.length);r>>1;t(n[o],e)<0?r=o+1:i=o}return r},right:function(n,e,r,i){for(null==r&&(r=0),null==i&&(i=n.length);r>>1;t(n[o],e)>0?i=o:r=o+1}return r}}}function r(t,n){return[t,n]}function i(t){return null===t?NaN:+t}function o(t,n){var e,r,o=t.length,a=0,u=-1,f=0,c=0;if(null==n)for(;++u1)return c/(a-1)}function a(t,n){var e=o(t,n);return e?Math.sqrt(e):e}function u(t,n){var e,r,i,o=t.length,a=-1;if(null==n){for(;++a=e)for(r=i=e;++ae&&(r=e),i=e)for(r=i=e;++ae&&(r=e),i0)return[t];if((r=n0)for(t=Math.ceil(t/a),n=Math.floor(n/a),o=new Array(i=Math.ceil(n-t+1));++u=0?(o>=es?10:o>=rs?5:o>=is?2:1)*Math.pow(10,i):-Math.pow(10,-i)/(o>=es?10:o>=rs?5:o>=is?2:1)}function d(t,n,e){var r=Math.abs(n-t)/Math.max(0,e),i=Math.pow(10,Math.floor(Math.log(r)/Math.LN10)),o=r/i;return o>=es?i*=10:o>=rs?i*=5:o>=is&&(i*=2),n=1)return+e(t[r-1],r-1,t);var r,o=(r-1)*n,a=Math.floor(o),u=+e(t[a],a,t);return u+(+e(t[a+1],a+1,t)-u)*(o-a)}}function g(t,n){var e,r,i=t.length,o=-1;if(null==n){for(;++o=e)for(r=e;++or&&(r=e)}else for(;++o=e)for(r=e;++or&&(r=e);return r}function y(t){for(var n,e,r,i=t.length,o=-1,a=0;++o=0;)for(n=(r=t[i]).length;--n>=0;)e[--a]=r[n];return e}function _(t,n){var e,r,i=t.length,o=-1;if(null==n){for(;++o=e)for(r=e;++oe&&(r=e)}else for(;++o=e)for(r=e;++oe&&(r=e);return r}function b(t){if(!(i=t.length))return[];for(var n=-1,e=_(t,m),r=new Array(e);++n=0&&"xmlns"!==(n=t.slice(0,e))&&(t=t.slice(e+1)),ds.hasOwnProperty(n)?{space:ds[n],local:t}:t}function C(t){var n=k(t);return(n.local?function(t){return function(){return this.ownerDocument.createElementNS(t.space,t.local)}}:function(t){return function(){var 
n=this.ownerDocument,e=this.namespaceURI;return e===hs&&n.documentElement.namespaceURI===hs?n.createElement(t):n.createElementNS(e,t)}})(n)}function P(){}function z(t){return null==t?P:function(){return this.querySelector(t)}}function R(){return[]}function L(t){return null==t?R:function(){return this.querySelectorAll(t)}}function D(t){return new Array(t.length)}function U(t,n){this.ownerDocument=t.ownerDocument,this.namespaceURI=t.namespaceURI,this._next=null,this._parent=t,this.__data__=n}function q(t,n,e,r,i,o){for(var a,u=0,f=n.length,c=o.length;un?1:t>=n?0:NaN}function B(t){return t.ownerDocument&&t.ownerDocument.defaultView||t.document&&t||t.defaultView}function F(t,n){return t.style.getPropertyValue(n)||B(t).getComputedStyle(t,null).getPropertyValue(n)}function I(t){return t.trim().split(/^|\s+/)}function j(t){return t.classList||new H(t)}function H(t){this._node=t,this._names=I(t.getAttribute("class")||"")}function X(t,n){for(var e=j(t),r=-1,i=n.length;++r>8&15|n>>4&240,n>>4&15|240&n,(15&n)<<4|15&n,1)):(n=Ns.exec(t))?Ct(parseInt(n[1],16)):(n=Ss.exec(t))?new Lt(n[1],n[2],n[3],1):(n=Es.exec(t))?new Lt(255*n[1]/100,255*n[2]/100,255*n[3]/100,1):(n=ks.exec(t))?Pt(n[1],n[2],n[3],n[4]):(n=Cs.exec(t))?Pt(255*n[1]/100,255*n[2]/100,255*n[3]/100,n[4]):(n=Ps.exec(t))?Dt(n[1],n[2]/100,n[3]/100,1):(n=zs.exec(t))?Dt(n[1],n[2]/100,n[3]/100,n[4]):Rs.hasOwnProperty(t)?Ct(Rs[t]):"transparent"===t?new Lt(NaN,NaN,NaN,0):null}function Ct(t){return new Lt(t>>16&255,t>>8&255,255&t,1)}function Pt(t,n,e,r){return r<=0&&(t=n=e=NaN),new Lt(t,n,e,r)}function zt(t){return t instanceof Et||(t=kt(t)),t?(t=t.rgb(),new Lt(t.r,t.g,t.b,t.opacity)):new Lt}function Rt(t,n,e,r){return 1===arguments.length?zt(t):new Lt(t,n,e,null==r?1:r)}function Lt(t,n,e,r){this.r=+t,this.g=+n,this.b=+e,this.opacity=+r}function Dt(t,n,e,r){return r<=0?t=n=e=NaN:e<=0||e>=1?t=n=NaN:n<=0&&(t=NaN),new qt(t,n,e,r)}function Ut(t,n,e,r){return 1===arguments.length?function(t){if(t instanceof qt)return new 
qt(t.h,t.s,t.l,t.opacity);if(t instanceof Et||(t=kt(t)),!t)return new qt;if(t instanceof qt)return t;var n=(t=t.rgb()).r/255,e=t.g/255,r=t.b/255,i=Math.min(n,e,r),o=Math.max(n,e,r),a=NaN,u=o-i,f=(o+i)/2;return u?(a=n===o?(e-r)/u+6*(e0&&f<1?0:a,new qt(a,u,f,t.opacity)}(t):new qt(t,n,e,null==r?1:r)}function qt(t,n,e,r){this.h=+t,this.s=+n,this.l=+e,this.opacity=+r}function Ot(t,n,e){return 255*(t<60?n+(e-n)*t/60:t<180?e:t<240?n+(e-n)*(240-t)/60:n)}function Yt(t){if(t instanceof Ft)return new Ft(t.l,t.a,t.b,t.opacity);if(t instanceof $t){if(isNaN(t.h))return new Ft(t.l,0,0,t.opacity);var n=t.h*Ls;return new Ft(t.l,Math.cos(n)*t.c,Math.sin(n)*t.c,t.opacity)}t instanceof Lt||(t=zt(t));var e,r,i=Xt(t.r),o=Xt(t.g),a=Xt(t.b),u=It((.2225045*i+.7168786*o+.0606169*a)/qs);return i===o&&o===a?e=r=u:(e=It((.4360747*i+.3850649*o+.1430804*a)/Us),r=It((.0139322*i+.0971045*o+.7141733*a)/Os)),new Ft(116*u-16,500*(e-u),200*(u-r),t.opacity)}function Bt(t,n,e,r){return 1===arguments.length?Yt(t):new Ft(t,n,e,null==r?1:r)}function Ft(t,n,e,r){this.l=+t,this.a=+n,this.b=+e,this.opacity=+r}function It(t){return t>Is?Math.pow(t,1/3):t/Fs+Ys}function jt(t){return t>Bs?t*t*t:Fs*(t-Ys)}function Ht(t){return 255*(t<=.0031308?12.92*t:1.055*Math.pow(t,1/2.4)-.055)}function Xt(t){return(t/=255)<=.04045?t/12.92:Math.pow((t+.055)/1.055,2.4)}function Gt(t){if(t instanceof $t)return new $t(t.h,t.c,t.l,t.opacity);if(t instanceof Ft||(t=Yt(t)),0===t.a&&0===t.b)return new $t(NaN,0,t.l,t.opacity);var n=Math.atan2(t.b,t.a)*Ds;return new $t(n<0?n+360:n,Math.sqrt(t.a*t.a+t.b*t.b),t.l,t.opacity)}function Vt(t,n,e,r){return 1===arguments.length?Gt(t):new $t(t,n,e,null==r?1:r)}function $t(t,n,e,r){this.h=+t,this.c=+n,this.l=+e,this.opacity=+r}function Wt(t,n,e,r){return 1===arguments.length?function(t){if(t instanceof Zt)return new Zt(t.h,t.s,t.l,t.opacity);t instanceof Lt||(t=zt(t));var 
n=t.r/255,e=t.g/255,r=t.b/255,i=($s*r+Gs*n-Vs*e)/($s+Gs-Vs),o=r-i,a=(Xs*(e-i)-js*o)/Hs,u=Math.sqrt(a*a+o*o)/(Xs*i*(1-i)),f=u?Math.atan2(a,o)*Ds-120:NaN;return new Zt(f<0?f+360:f,u,i,t.opacity)}(t):new Zt(t,n,e,null==r?1:r)}function Zt(t,n,e,r){this.h=+t,this.s=+n,this.l=+e,this.opacity=+r}function Qt(t,n,e,r,i){var o=t*t,a=o*t;return((1-3*t+3*o-a)*n+(4-6*o+3*a)*e+(1+3*t+3*o-3*a)*r+a*i)/6}function Jt(t){var n=t.length-1;return function(e){var r=e<=0?e=0:e>=1?(e=1,n-1):Math.floor(e*n),i=t[r],o=t[r+1],a=r>0?t[r-1]:2*i-o,u=r180||e<-180?e-360*Math.round(e/360):e):tn(isNaN(t)?n:t)}function rn(t){return 1==(t=+t)?on:function(n,e){return e-n?function(t,n,e){return t=Math.pow(t,e),n=Math.pow(n,e)-t,e=1/e,function(r){return Math.pow(t+r*n,e)}}(n,e,t):tn(isNaN(n)?e:n)}}function on(t,n){var e=n-t;return e?nn(t,e):tn(isNaN(t)?n:t)}function an(t){return function(n){var e,r,i=n.length,o=new Array(i),a=new Array(i),u=new Array(i);for(e=0;eo&&(i=n.slice(o,i),u[a]?u[a]+=i:u[++a]=i),(e=e[0])===(r=r[0])?u[a]?u[a]+=r:u[++a]=r:(u[++a]=null,f.push({i:a,x:cn(e,r)})),o=ol.lastIndex;return o180?n+=360:n-t>180&&(t+=360),o.push({i:e.push(i(e)+"rotate(",null,r)-2,x:cn(t,n)})):n&&e.push(i(e)+"rotate("+n+r)}(o.rotate,a.rotate,u,f),function(t,n,e,o){t!==n?o.push({i:e.push(i(e)+"skewX(",null,r)-2,x:cn(t,n)}):n&&e.push(i(e)+"skewX("+n+r)}(o.skewX,a.skewX,u,f),function(t,n,e,r,o,a){if(t!==e||n!==r){var u=o.push(i(o)+"scale(",null,",",null,")");a.push({i:u-4,x:cn(t,e)},{i:u-2,x:cn(n,r)})}else 1===e&&1===r||o.push(i(o)+"scale("+e+","+r+")")}(o.scaleX,o.scaleY,a.scaleX,a.scaleY,u,f),o=a=null,function(t){for(var n,e=-1,r=f.length;++e=0&&n._call.call(null,t),n=n._next;--ml}function Nn(){Tl=(Al=Sl.now())+Nl,ml=xl=0;try{Tn()}finally{ml=0,function(){var t,n,e=Ks,r=1/0;for(;e;)e._call?(r>e._time&&(r=e._time),t=e,e=e._next):(n=e._next,e._next=null,e=t?t._next=n:Ks=n);tl=t,En(r)}(),Tl=0}}function Sn(){var t=Sl.now(),n=t-Al;n>Ml&&(Nl-=n,Al=t)}function 
En(t){if(!ml){xl&&(xl=clearTimeout(xl));t-Tl>24?(t<1/0&&(xl=setTimeout(Nn,t-Sl.now()-Nl)),wl&&(wl=clearInterval(wl))):(wl||(Al=Sl.now(),wl=setInterval(Sn,Ml)),ml=1,El(Nn))}}function kn(t,n,e){var r=new Mn;return n=null==n?0:+n,r.restart(function(e){r.stop(),t(e+n)},n,e),r}function Cn(t,n,e,r,i,o){var a=t.__transition;if(a){if(e in a)return}else t.__transition={};(function(t,n,e){function r(f){var c,s,l,h;if(e.state!==zl)return o();for(c in u)if((h=u[c]).name===e.name){if(h.state===Ll)return kn(r);h.state===Dl?(h.state=ql,h.timer.stop(),h.on.call("interrupt",t,t.__data__,h.index,h.group),delete u[c]):+cPl)throw new Error("too late; already scheduled");return e}function zn(t,n){var e=Rn(t,n);if(e.state>Rl)throw new Error("too late; already started");return e}function Rn(t,n){var e=t.__transition;if(!e||!(e=e[n]))throw new Error("transition not found");return e}function Ln(t,n){var e,r,i,o=t.__transition,a=!0;if(o){n=null==n?null:n+"";for(i in o)(e=o[i]).name===n?(r=e.state>Rl&&e.stateMath.abs(t[1]-U[1])?x=!0:m=!0),U=t,b=!0,Wn(),o()}function o(){var t;switch(y=U[0]-D[0],_=U[1]-D[1],A){case hh:case lh:T&&(y=Math.max(C-u,Math.min(z-d,y)),c=u+y,p=d+y),N&&(_=Math.max(P-l,Math.min(R-v,_)),h=l+_,g=v+_);break;case dh:T<0?(y=Math.max(C-u,Math.min(z-u,y)),c=u+y,p=d):T>0&&(y=Math.max(C-d,Math.min(z-d,y)),c=u,p=d+y),N<0?(_=Math.max(P-l,Math.min(R-l,_)),h=l+_,g=v):N>0&&(_=Math.max(P-v,Math.min(R-v,_)),h=l,g=v+_);break;case ph:T&&(c=Math.max(C,Math.min(z,u-y*T)),p=Math.max(C,Math.min(z,d+y*T))),N&&(h=Math.max(P,Math.min(R,l-_*N)),g=Math.max(P,Math.min(R,v+_*N)))}p0&&(u=c-y),N<0?v=g-_:N>0&&(l=h-_),A=hh,Y.attr("cursor",_h.selection),o());break;default:return}Wn()},!0).on("keyup.brush",function(){switch(t.event.keyCode){case 16:L&&(m=x=L=!1,o());break;case 18:A===ph&&(T<0?d=p:T>0&&(u=c),N<0?v=g:N>0&&(l=h),A=dh,o());break;case 
32:A===hh&&(t.event.altKey?(T&&(d=p-y*T,u=c+y*T),N&&(v=g-_*N,l=h+_*N),A=ph):(T<0?d=p:T>0&&(u=c),N<0?v=g:N>0&&(l=h),A=dh),Y.attr("cursor",_h[M]),o());break;default:return}Wn()},!0).on("mousemove.brush",e,!0).on("mouseup.brush",a,!0);_t(t.event.view)}$n(),Ln(w),r.call(w),q.start()}}function u(){var t=this.__brush||{selection:null};return t.extent=c.apply(this,arguments),t.dim=n,t}var f,c=Jn,s=Qn,l=N(e,"start","brush","end"),h=6;return e.move=function(t,e){t.selection?t.on("start.brush",function(){i(this,arguments).beforestart().start()}).on("interrupt.brush end.brush",function(){i(this,arguments).end()}).tween("brush",function(){function t(t){a.selection=1===t&&te(c)?null:s(t),r.call(o),u.brush()}var o=this,a=o.__brush,u=i(o,arguments),f=a.selection,c=n.input("function"==typeof e?e.apply(this,arguments):e,a.extent),s=hn(f,c);return f&&c?t:t(1)}):t.each(function(){var t=arguments,o=this.__brush,a=n.input("function"==typeof e?e.apply(this,t):e,o.extent),u=i(this,t).beforestart();Ln(this),o.selection=null==a||te(a)?null:a,r.call(this),u.start().brush().end()})},o.prototype={beforestart:function(){return 1==++this.active&&(this.state.emitter=this,this.starting=!0),this},start:function(){return this.starting&&(this.starting=!1,this.emit("start")),this},brush:function(){return this.emit("brush"),this},end:function(){return 0==--this.active&&(delete this.state.emitter,this.emit("end")),this},emit:function(t){ot(new function(t,n,e){this.target=t,this.type=n,this.selection=e}(e,t,n.output(this.state.selection)),l.apply,l,[t,this.that,this.args])}},e.extent=function(t){return arguments.length?(c="function"==typeof t?t:Vn([[+t[0][0],+t[0][1]],[+t[1][0],+t[1][1]]]),e):c},e.filter=function(t){return arguments.length?(s="function"==typeof t?t:Vn(!!t),e):s},e.handleSize=function(t){return arguments.length?(h=+t,e):h},e.on=function(){var t=l.on.apply(l,arguments);return t===l?e:t},e}function ee(t){return function(){return t}}function 
re(){this._x0=this._y0=this._x1=this._y1=null,this._=""}function ie(){return new re}function oe(t){return t.source}function ae(t){return t.target}function ue(t){return t.radius}function fe(t){return t.startAngle}function ce(t){return t.endAngle}function se(){}function le(t,n){var e=new se;if(t instanceof se)t.each(function(t,n){e.set(n,t)});else if(Array.isArray(t)){var r,i=-1,o=t.length;if(null==n)for(;++ir!=d>r&&e<(h-c)*(r-s)/(d-s)+c&&(i=-i)}return i}(t,n[r]))return e;return 0}function xe(){}function we(){function t(t){var e=a(t);if(Array.isArray(e))e=e.slice().sort(_e);else{var r=u(t),i=r[0],o=r[1];e=d(i,o,e),e=s(Math.floor(i/e)*e,Math.floor(o/e)*e,e)}return e.map(function(e){return n(t,e)})}function n(t,n){var r=[],a=[];return function(t,n,r){function a(t){var n,i,o=[t[0][0]+u,t[0][1]+f],a=[t[1][0]+u,t[1][1]+f],c=e(o),s=e(a);(n=p[c])?(i=d[s])?(delete p[n.end],delete d[i.start],n===i?(n.ring.push(a),r(n.ring)):d[n.start]=p[i.end]={start:n.start,end:i.end,ring:n.ring.concat(i.ring)}):(delete p[n.end],n.ring.push(a),p[n.end=s]=n):(n=d[s])?(i=p[c])?(delete d[n.start],delete p[i.end],n===i?(n.ring.push(a),r(n.ring)):d[i.start]=p[n.end]={start:i.start,end:n.end,ring:i.ring.concat(n.ring)}):(delete d[n.start],n.ring.unshift(o),d[n.start=c]=n):d[c]=p[s]={start:c,end:s,ring:[o,a]}}var u,f,c,s,l,h,d=new Array,p=new Array;u=f=-1,s=t[0]>=n,Dh[s<<1].forEach(a);for(;++u=n,Dh[c|s<<1].forEach(a);Dh[s<<0].forEach(a);for(;++f=n,l=t[f*i]>=n,Dh[s<<1|l<<2].forEach(a);++u=n,h=l,l=t[f*i+u+1]>=n,Dh[c|s<<1|l<<2|h<<3].forEach(a);Dh[s|l<<3].forEach(a)}u=-1,l=t[f*i]>=n,Dh[l<<2].forEach(a);for(;++u=n,Dh[l<<2|h<<3].forEach(a);Dh[l<<3].forEach(a)}(t,n,function(e){f(e,t,n),function(t){for(var n=0,e=t.length,r=t[e-1][1]*t[0][0]-t[e-1][0]*t[0][1];++n0?r.push([e]):a.push(e)}),a.forEach(function(t){for(var n,e=0,i=r.length;e0&&a0&&u0&&r>0))throw new Error("invalid size");return i=e,o=r,t},t.thresholds=function(n){return arguments.length?(a="function"==typeof 
n?n:Array.isArray(n)?be(Lh.call(n)):be(n),t):a},t.smooth=function(n){return arguments.length?(f=n?r:xe,t):f===r},t}function Me(t,n,e){for(var r=t.width,i=t.height,o=1+(e<<1),a=0;a=e&&(u>=o&&(f-=t.data[u-o+a*r]),n.data[u-e+a*r]=f/Math.min(u+1,r-1+o-u,o))}function Ae(t,n,e){for(var r=t.width,i=t.height,o=1+(e<<1),a=0;a=e&&(u>=o&&(f-=t.data[a+(u-o)*r]),n.data[a+(u-e)*r]=f/Math.min(u+1,i-1+o-u,o))}function Te(t){return t[0]}function Ne(t){return t[1]}function Se(t){return new Function("d","return {"+t.map(function(t,n){return JSON.stringify(t)+": d["+n+"]"}).join(",")+"}")}function Ee(t){function n(t,n){function e(){if(c)return qh;if(s)return s=!1,Uh;var n,e,r=u;if(t.charCodeAt(r)===Oh){for(;u++=a?c=!0:(e=t.charCodeAt(u++))===Yh?s=!0:e===Bh&&(s=!0,t.charCodeAt(u)===Yh&&++u),t.slice(r+1,n-1).replace(/""/g,'"')}for(;u=(o=(v+y)/2))?v=o:y=o,(s=e>=(a=(g+_)/2))?g=a:_=a,i=d,!(d=d[l=s<<1|c]))return i[l]=p,t;if(u=+t._x.call(null,d.data),f=+t._y.call(null,d.data),n===u&&e===f)return p.next=d,i?i[l]=p:t._root=p,t;do{i=i?i[l]=new Array(4):t._root=new Array(4),(c=n>=(o=(v+y)/2))?v=o:y=o,(s=e>=(a=(g+_)/2))?g=a:_=a}while((l=s<<1|c)==(h=(f>=a)<<1|u>=o));return i[h]=d,i[l]=p,t}function Ye(t,n,e,r,i){this.node=t,this.x0=n,this.y0=e,this.x1=r,this.y1=i}function Be(t){return t[0]}function Fe(t){return t[1]}function Ie(t,n,e){var r=new je(null==n?Be:n,null==e?Fe:e,NaN,NaN,NaN,NaN);return null==t?r:r.addAll(t)}function je(t,n,e,r,i,o){this._x=t,this._y=n,this._x0=e,this._y0=r,this._x1=i,this._y1=o,this._root=void 0}function He(t){for(var n={data:t.data},e=n;t=t.next;)e=e.next={data:t.data};return n}function Xe(t){return t.x+t.vx}function Ge(t){return t.y+t.vy}function Ve(t){return t.index}function $e(t,n){var e=t.get(n);if(!e)throw new Error("missing: "+n);return e}function We(t){return t.x}function Ze(t){return t.y}function Qe(t,n){if((e=(t=n?t.toExponential(n-1):t.toExponential()).indexOf("e"))<0)return null;var e,r=t.slice(0,e);return[r.length>1?r[0]+r.slice(2):r,+t.slice(e+1)]}function 
Je(t){return(t=Qe(Math.abs(t)))?t[1]:NaN}function Ke(t,n){var e=Qe(t,n);if(!e)return t+"";var r=e[0],i=e[1];return i<0?"0."+new Array(-i).join("0")+r:r.length>i+1?r.slice(0,i+1)+"."+r.slice(i+1):r+new Array(i-r.length+2).join("0")}function tr(t){return new nr(t)}function nr(t){if(!(n=ud.exec(t)))throw new Error("invalid format: "+t);var n,e=n[1]||" ",r=n[2]||">",i=n[3]||"-",o=n[4]||"",a=!!n[5],u=n[6]&&+n[6],f=!!n[7],c=n[8]&&+n[8].slice(1),s=n[9]||"";"n"===s?(f=!0,s="g"):ad[s]||(s=""),(a||"0"===e&&"="===r)&&(a=!0,e="0",r="="),this.fill=e,this.align=r,this.sign=i,this.symbol=o,this.zero=a,this.width=u,this.comma=f,this.precision=c,this.type=s}function er(t){return t}function rr(t){function n(t){function n(t){var n,r,a,s=g,m=y;if("c"===v)m=_(t)+m,t="";else{var x=(t=+t)<0;if(t=_(Math.abs(t),p),x&&0==+t&&(x=!1),s=(x?"("===c?c:"-":"-"===c||"("===c?"":c)+s,m=("s"===v?cd[8+rd/3]:"")+m+(x&&"("===c?")":""),b)for(n=-1,r=t.length;++n(a=t.charCodeAt(n))||a>57){m=(46===a?i+t.slice(n+1):t.slice(n))+m,t=t.slice(0,n);break}}d&&!l&&(t=e(t,1/0));var w=s.length+t.length+m.length,M=w>1)+s+t+m+M.slice(w);break;default:t=M+s+t+m}return o(t)}var u=(t=tr(t)).fill,f=t.align,c=t.sign,s=t.symbol,l=t.zero,h=t.width,d=t.comma,p=t.precision,v=t.type,g="$"===s?r[0]:"#"===s&&/[boxX]/.test(v)?"0"+v.toLowerCase():"",y="$"===s?r[1]:/[%p]/.test(v)?a:"",_=ad[v],b=!v||/[defgprs%]/.test(v);return p=null==p?v?6:12:/[gprs]/.test(v)?Math.max(1,Math.min(21,p)):Math.max(0,Math.min(20,p)),n.toString=function(){return t+""},n}var e=t.grouping&&t.thousands?function(t,n){return function(e,r){for(var i=e.length,o=[],a=0,u=t[0],f=0;i>0&&u>0&&(f+u+1>r&&(u=Math.max(1,r-f)),o.push(e.substring(i-=u,i+u)),!((f+=u+1)>r));)u=t[a=(a+1)%t.length];return o.reverse().join(n)}}(t.grouping,t.thousands):er,r=t.currency,i=t.decimal,o=t.numerals?function(t){return function(n){return n.replace(/[0-9]/g,function(n){return t[+n]})}}(t.numerals):er,a=t.percent||"%";return{format:n,formatPrefix:function(t,e){var 
r=n((t=tr(t),t.type="f",t)),i=3*Math.max(-8,Math.min(8,Math.floor(Je(e)/3))),o=Math.pow(10,-i),a=cd[8+i/3];return function(t){return r(o*t)+a}}}}function ir(n){return fd=rr(n),t.format=fd.format,t.formatPrefix=fd.formatPrefix,fd}function or(t){return Math.max(0,-Je(Math.abs(t)))}function ar(t,n){return Math.max(0,3*Math.max(-8,Math.min(8,Math.floor(Je(n)/3)))-Je(Math.abs(t)))}function ur(t,n){return t=Math.abs(t),n=Math.abs(n)-t,Math.max(0,Je(n)-Je(t))+1}function fr(){return new cr}function cr(){this.reset()}function sr(t,n,e){var r=t.s=n+e,i=r-n,o=r-i;t.t=n-o+(e-i)}function lr(t){return t>1?0:t<-1?Hd:Math.acos(t)}function hr(t){return t>1?Xd:t<-1?-Xd:Math.asin(t)}function dr(t){return(t=ip(t/2))*t}function pr(){}function vr(t,n){t&&cp.hasOwnProperty(t.type)&&cp[t.type](t,n)}function gr(t,n,e){var r,i=-1,o=t.length-e;for(n.lineStart();++i=0?1:-1,i=r*e,o=Kd(n),a=ip(n),u=pd*a,f=dd*o+u*Kd(i),c=u*r*ip(i);sp.add(Jd(c,f)),hd=t,dd=o,pd=a}function Mr(t){return[Jd(t[1],t[0]),hr(t[2])]}function Ar(t){var n=t[0],e=t[1],r=Kd(e);return[r*Kd(n),r*ip(n),ip(e)]}function Tr(t,n){return t[0]*n[0]+t[1]*n[1]+t[2]*n[2]}function Nr(t,n){return[t[1]*n[2]-t[2]*n[1],t[2]*n[0]-t[0]*n[2],t[0]*n[1]-t[1]*n[0]]}function Sr(t,n){t[0]+=n[0],t[1]+=n[1],t[2]+=n[2]}function Er(t,n){return[t[0]*n,t[1]*n,t[2]*n]}function kr(t){var n=ap(t[0]*t[0]+t[1]*t[1]+t[2]*t[2]);t[0]/=n,t[1]/=n,t[2]/=n}function Cr(t,n){Md.push(Ad=[vd=t,yd=t]),n_d&&(_d=n)}function Pr(t,n){var e=Ar([t*Wd,n*Wd]);if(wd){var r=Nr(wd,e),i=Nr([r[1],-r[0],0],r);kr(i),i=Mr(i);var o,a=t-bd,u=a>0?1:-1,f=i[0]*$d*u,c=Zd(a)>180;c^(u*bd_d&&(_d=o):(f=(f+360)%360-180,c^(u*bd_d&&(_d=n))),c?tqr(vd,yd)&&(yd=t):qr(t,yd)>qr(vd,yd)&&(vd=t):yd>=vd?(tyd&&(yd=t)):t>bd?qr(vd,t)>qr(vd,yd)&&(yd=t):qr(t,yd)>qr(vd,yd)&&(vd=t)}else Md.push(Ad=[vd=t,yd=t]);n_d&&(_d=n),wd=e,bd=t}function zr(){pp.point=Pr}function Rr(){Ad[0]=vd,Ad[1]=yd,pp.point=Cr,wd=null}function Lr(t,n){if(wd){var e=t-bd;dp.add(Zd(e)>180?e+(e>0?360:-360):e)}else 
md=t,xd=n;hp.point(t,n),Pr(t,n)}function Dr(){hp.lineStart()}function Ur(){Lr(md,xd),hp.lineEnd(),Zd(dp)>Id&&(vd=-(yd=180)),Ad[0]=vd,Ad[1]=yd,wd=null}function qr(t,n){return(n-=t)<0?n+360:n}function Or(t,n){return t[0]-n[0]}function Yr(t,n){return t[0]<=t[1]?t[0]<=n&&n<=t[1]:nHd?t-Vd:t<-Hd?t+Vd:t,n]}function Kr(t,n,e){return(t%=Vd)?n||e?Qr(ni(t),ei(n,e)):ni(t):n||e?ei(n,e):Jr}function ti(t){return function(n,e){return n+=t,[n>Hd?n-Vd:n<-Hd?n+Vd:n,e]}}function ni(t){var n=ti(t);return n.invert=ti(-t),n}function ei(t,n){function e(t,n){var e=Kd(n),u=Kd(t)*e,f=ip(t)*e,c=ip(n),s=c*r+u*i;return[Jd(f*o-s*a,u*r-c*i),hr(s*o+f*a)]}var r=Kd(t),i=ip(t),o=Kd(n),a=ip(n);return e.invert=function(t,n){var e=Kd(n),u=Kd(t)*e,f=ip(t)*e,c=ip(n),s=c*o-f*a;return[Jd(f*o+c*a,u*r+s*i),hr(s*r-u*i)]},e}function ri(t){function n(n){return n=t(n[0]*Wd,n[1]*Wd),n[0]*=$d,n[1]*=$d,n}return t=Kr(t[0]*Wd,t[1]*Wd,t.length>2?t[2]*Wd:0),n.invert=function(n){return n=t.invert(n[0]*Wd,n[1]*Wd),n[0]*=$d,n[1]*=$d,n},n}function ii(t,n,e,r,i,o){if(e){var a=Kd(n),u=ip(n),f=r*e;null==i?(i=n+r*Vd,o=n-f/2):(i=oi(a,i),o=oi(a,o),(r>0?io)&&(i+=r*Vd));for(var c,s=i;r>0?s>o:s1&&n.push(n.pop().concat(n.shift()))},result:function(){var e=n;return n=[],t=null,e}}}function ui(t,n){return Zd(t[0]-n[0])=0;--o)i.point((s=c[o])[0],s[1]);else r(h.x,h.p.x,-1,i);h=h.p}c=(h=h.o).z,d=!d}while(!h.v);i.lineEnd()}}}function si(t){if(n=t.length){for(var n,e,r=0,i=t[0];++r=0?1:-1,T=A*M,N=T>Hd,S=v*x;if(Sp.add(Jd(S*A*ip(T),g*w+S*Kd(T))),a+=N?M+A*Vd:M,N^d>=e^b>=e){var E=Nr(Ar(h),Ar(_));kr(E);var k=Nr(o,E);kr(k);var C=(N^M>=0?-1:1)*hr(k[2]);(r>C||r===C&&(E[0]||E[1]))&&(u+=N^M>=0?1:-1)}}return(a<-Id||a0){for(b||(i.polygonStart(),b=!0),i.lineStart(),t=0;t1&&2&o&&a.push(a.pop().concat(a.shift())),d.push(a.filter(di))}var h,d,p,v=n(i),g=ai(),_=n(g),b=!1,m={point:o,lineStart:u,lineEnd:f,polygonStart:function(){m.point=c,m.lineStart=s,m.lineEnd=l,d=[],h=[]},polygonEnd:function(){m.point=o,m.lineStart=u,m.lineEnd=f,d=y(d);var 
t=li(h,r);d.length?(b||(i.polygonStart(),b=!0),ci(d,pi,t,e,i)):t&&(b||(i.polygonStart(),b=!0),i.lineStart(),e(null,null,1,i),i.lineEnd()),b&&(i.polygonEnd(),b=!1),d=h=null},sphere:function(){i.polygonStart(),i.lineStart(),e(null,null,1,i),i.lineEnd(),i.polygonEnd()}};return m}}function di(t){return t.length>1}function pi(t,n){return((t=t.x)[0]<0?t[1]-Xd-Id:Xd-t[1])-((n=n.x)[0]<0?n[1]-Xd-Id:Xd-n[1])}function vi(t){function n(t,n){return Kd(t)*Kd(n)>i}function e(t,n,e){var r=[1,0,0],o=Nr(Ar(t),Ar(n)),a=Tr(o,o),u=o[0],f=a-u*u;if(!f)return!e&&t;var c=i*a/f,s=-i*u/f,l=Nr(r,o),h=Er(r,c);Sr(h,Er(o,s));var d=l,p=Tr(h,d),v=Tr(d,d),g=p*p-v*(Tr(h,h)-1);if(!(g<0)){var y=ap(g),_=Er(d,(-p-y)/v);if(Sr(_,h),_=Mr(_),!e)return _;var b,m=t[0],x=n[0],w=t[1],M=n[1];x0^_[1]<(Zd(_[0]-m)Hd^(m<=_[0]&&_[0]<=x)){var N=Er(d,(-p+y)/v);return Sr(N,h),[_,Mr(N)]}}}function r(n,e){var r=a?t:Hd-t,i=0;return n<-r?i|=1:n>r&&(i|=2),e<-r?i|=4:e>r&&(i|=8),i}var i=Kd(t),o=6*Wd,a=i>0,u=Zd(i)>Id;return hi(n,function(t){var i,o,f,c,s;return{lineStart:function(){c=f=!1,s=1},point:function(l,h){var d,p=[l,h],v=n(l,h),g=a?v?0:r(l,h):v?r(l+(l<0?Hd:-Hd),h):0;if(!i&&(c=f=v)&&t.lineStart(),v!==f&&(!(d=e(i,p))||ui(i,d)||ui(p,d))&&(p[0]+=Id,p[1]+=Id,v=n(p[0],p[1])),v!==f)s=0,v?(t.lineStart(),d=e(p,i),t.point(d[0],d[1])):(d=e(i,p),t.point(d[0],d[1]),t.lineEnd()),i=d;else if(u&&i&&a^v){var y;g&o||!(y=e(p,i,!0))||(s=0,a?(t.lineStart(),t.point(y[0][0],y[0][1]),t.point(y[1][0],y[1][1]),t.lineEnd()):(t.point(y[1][0],y[1][1]),t.lineEnd(),t.lineStart(),t.point(y[0][0],y[0][1])))}!v||i&&ui(i,p)||t.point(p[0],p[1]),i=p,f=v,o=g},lineEnd:function(){f&&t.lineEnd(),i=null},clean:function(){return s|(c&&f)<<1}}},function(n,e,r,i){ii(i,t,o,r,n,e)},a?[0,-t]:[-Hd,t-Hd])}function gi(t,n,e,r){function i(i,o){return t<=i&&i<=e&&n<=o&&o<=r}function o(i,o,u,c){var s=0,l=0;if(null==i||(s=a(i,u))!==(l=a(o,u))||f(i,o)<0^u>0)do{c.point(0===s||3===s?t:e,s>1?r:n)}while((s=(s+u+4)%4)!==l);else c.point(o[0],o[1])}function a(r,i){return 
Zd(r[0]-t)0?0:3:Zd(r[0]-e)0?2:1:Zd(r[1]-n)0?1:0:i>0?3:2}function u(t,n){return f(t.x,n.x)}function f(t,n){var e=a(t,1),r=a(n,1);return e!==r?e-r:0===e?n[1]-t[1]:1===e?t[0]-n[0]:2===e?t[1]-n[1]:n[0]-t[0]}return function(a){function f(t,n){i(t,n)&&w.point(t,n)}function c(o,a){var u=i(o,a);if(l&&h.push([o,a]),m)d=o,p=a,v=u,m=!1,u&&(w.lineStart(),w.point(o,a));else if(u&&b)w.point(o,a);else{var f=[g=Math.max(Cp,Math.min(kp,g)),_=Math.max(Cp,Math.min(kp,_))],c=[o=Math.max(Cp,Math.min(kp,o)),a=Math.max(Cp,Math.min(kp,a))];!function(t,n,e,r,i,o){var a,u=t[0],f=t[1],c=0,s=1,l=n[0]-u,h=n[1]-f;if(a=e-u,l||!(a>0)){if(a/=l,l<0){if(a0){if(a>s)return;a>c&&(c=a)}if(a=i-u,l||!(a<0)){if(a/=l,l<0){if(a>s)return;a>c&&(c=a)}else if(l>0){if(a0)){if(a/=h,h<0){if(a0){if(a>s)return;a>c&&(c=a)}if(a=o-f,h||!(a<0)){if(a/=h,h<0){if(a>s)return;a>c&&(c=a)}else if(h>0){if(a0&&(t[0]=u+c*l,t[1]=f+c*h),s<1&&(n[0]=u+s*l,n[1]=f+s*h),!0}}}}}(f,c,t,n,e,r)?u&&(w.lineStart(),w.point(o,a),x=!1):(b||(w.lineStart(),w.point(f[0],f[1])),w.point(c[0],c[1]),u||w.lineEnd(),x=!1)}g=o,_=a,b=u}var s,l,h,d,p,v,g,_,b,m,x,w=a,M=ai(),A={point:f,lineStart:function(){A.point=c,l&&l.push(h=[]),m=!0,b=!1,g=_=NaN},lineEnd:function(){s&&(c(d,p),v&&b&&M.rejoin(),s.push(M.result())),A.point=f,b&&w.lineEnd()},polygonStart:function(){w=M,s=[],l=[],x=!0},polygonEnd:function(){var n=function(){for(var n=0,e=0,i=l.length;er&&(h-o)*(r-a)>(d-a)*(t-o)&&++n:d<=r&&(h-o)*(r-a)<(d-a)*(t-o)&&--n;return n}(),e=x&&n,i=(s=y(s)).length;(e||i)&&(a.polygonStart(),e&&(a.lineStart(),o(null,null,1,a),a.lineEnd()),i&&ci(s,u,n,o,a),a.polygonEnd()),w=a,s=l=h=null}};return A}}function yi(){zp.point=zp.lineEnd=pr}function _i(t,n){gp=t*=Wd,yp=ip(n*=Wd),_p=Kd(n),zp.point=bi}function bi(t,n){t*=Wd;var e=ip(n*=Wd),r=Kd(n),i=Zd(t-gp),o=Kd(i),a=r*ip(i),u=_p*e-yp*r*o,f=yp*e+_p*r*o;Pp.add(Jd(ap(a*a+u*u),f)),gp=t,yp=e,_p=r}function mi(t){return Pp.reset(),_r(t,zp),+Pp}function xi(t,n){return Rp[0]=t,Rp[1]=n,mi(Lp)}function 
wi(t,n){return!(!t||!Up.hasOwnProperty(t.type))&&Up[t.type](t,n)}function Mi(t,n){return 0===xi(t,n)}function Ai(t,n){var e=xi(t[0],t[1]);return xi(t[0],n)+xi(n,t[1])<=e+Id}function Ti(t,n){return!!li(t.map(Ni),Si(n))}function Ni(t){return(t=t.map(Si)).pop(),t}function Si(t){return[t[0]*Wd,t[1]*Wd]}function Ei(t,n,e){var r=s(t,n-Id,e).concat(n);return function(t){return r.map(function(n){return[t,n]})}}function ki(t,n,e){var r=s(t,n-Id,e).concat(n);return function(t){return r.map(function(n){return[n,t]})}}function Ci(){function t(){return{type:"MultiLineString",coordinates:n()}}function n(){return s(tp(o/y)*y,i,y).map(d).concat(s(tp(c/_)*_,f,_).map(p)).concat(s(tp(r/v)*v,e,v).filter(function(t){return Zd(t%y)>Id}).map(l)).concat(s(tp(u/g)*g,a,g).filter(function(t){return Zd(t%_)>Id}).map(h))}var e,r,i,o,a,u,f,c,l,h,d,p,v=10,g=v,y=90,_=360,b=2.5;return t.lines=function(){return n().map(function(t){return{type:"LineString",coordinates:t}})},t.outline=function(){return{type:"Polygon",coordinates:[d(o).concat(p(f).slice(1),d(i).reverse().slice(1),p(c).reverse().slice(1))]}},t.extent=function(n){return arguments.length?t.extentMajor(n).extentMinor(n):t.extentMinor()},t.extentMajor=function(n){return arguments.length?(o=+n[0][0],i=+n[1][0],c=+n[0][1],f=+n[1][1],o>i&&(n=o,o=i,i=n),c>f&&(n=c,c=f,f=n),t.precision(b)):[[o,c],[i,f]]},t.extentMinor=function(n){return arguments.length?(r=+n[0][0],e=+n[1][0],u=+n[0][1],a=+n[1][1],r>e&&(n=r,r=e,e=n),u>a&&(n=u,u=a,a=n),t.precision(b)):[[r,u],[e,a]]},t.step=function(n){return arguments.length?t.stepMajor(n).stepMinor(n):t.stepMinor()},t.stepMajor=function(n){return arguments.length?(y=+n[0],_=+n[1],t):[y,_]},t.stepMinor=function(n){return arguments.length?(v=+n[0],g=+n[1],t):[v,g]},t.precision=function(n){return arguments.length?(b=+n,l=Ei(u,a,90),h=ki(r,e,b),d=Ei(c,f,90),p=ki(o,i,b),t):b},t.extentMajor([[-180,-90+Id],[180,90-Id]]).extentMinor([[-180,-80-Id],[180,80+Id]])}function Pi(t){return t}function zi(){Yp.point=Ri}function 
Ri(t,n){Yp.point=Li,bp=xp=t,mp=wp=n}function Li(t,n){Op.add(wp*t-xp*n),xp=t,wp=n}function Di(){Li(bp,mp)}function Ui(t,n){Xp+=t,Gp+=n,++Vp}function qi(){tv.point=Oi}function Oi(t,n){tv.point=Yi,Ui(Tp=t,Np=n)}function Yi(t,n){var e=t-Tp,r=n-Np,i=ap(e*e+r*r);$p+=i*(Tp+t)/2,Wp+=i*(Np+n)/2,Zp+=i,Ui(Tp=t,Np=n)}function Bi(){tv.point=Ui}function Fi(){tv.point=ji}function Ii(){Hi(Mp,Ap)}function ji(t,n){tv.point=Hi,Ui(Mp=Tp=t,Ap=Np=n)}function Hi(t,n){var e=t-Tp,r=n-Np,i=ap(e*e+r*r);$p+=i*(Tp+t)/2,Wp+=i*(Np+n)/2,Zp+=i,Qp+=(i=Np*t-Tp*n)*(Tp+t),Jp+=i*(Np+n),Kp+=3*i,Ui(Tp=t,Np=n)}function Xi(t){this._context=t}function Gi(t,n){uv.point=Vi,ev=iv=t,rv=ov=n}function Vi(t,n){iv-=t,ov-=n,av.add(ap(iv*iv+ov*ov)),iv=t,ov=n}function $i(){this._string=[]}function Wi(t){return"m0,"+t+"a"+t+","+t+" 0 1,1 0,"+-2*t+"a"+t+","+t+" 0 1,1 0,"+2*t+"z"}function Zi(t){return function(n){var e=new Qi;for(var r in t)e[r]=t[r];return e.stream=n,e}}function Qi(){}function Ji(t,n,e){var r=t.clipExtent&&t.clipExtent();return t.scale(150).translate([0,0]),null!=r&&t.clipExtent(null),_r(e,t.stream(Hp)),n(Hp.result()),null!=r&&t.clipExtent(r),t}function Ki(t,n,e){return Ji(t,function(e){var r=n[1][0]-n[0][0],i=n[1][1]-n[0][1],o=Math.min(r/(e[1][0]-e[0][0]),i/(e[1][1]-e[0][1])),a=+n[0][0]+(r-o*(e[1][0]+e[0][0]))/2,u=+n[0][1]+(i-o*(e[1][1]+e[0][1]))/2;t.scale(150*o).translate([a,u])},e)}function to(t,n,e){return Ki(t,[[0,0],n],e)}function no(t,n,e){return Ji(t,function(e){var r=+n,i=r/(e[1][0]-e[0][0]),o=(r-i*(e[1][0]+e[0][0]))/2,a=-i*e[0][1];t.scale(150*i).translate([o,a])},e)}function eo(t,n,e){return Ji(t,function(e){var r=+n,i=r/(e[1][1]-e[0][1]),o=-i*e[0][0],a=(r-i*(e[1][1]+e[0][1]))/2;t.scale(150*i).translate([o,a])},e)}function ro(t,n){return+n?function(t,n){function e(r,i,o,a,u,f,c,s,l,h,d,p,v,g){var y=c-r,_=s-i,b=y*y+_*_;if(b>4*n&&v--){var 
m=a+h,x=u+d,w=f+p,M=ap(m*m+x*x+w*w),A=hr(w/=M),T=Zd(Zd(w)-1)n||Zd((y*k+_*C)/b-.5)>.3||a*h+u*d+f*p2?t[2]%360*Wd:0,e()):[b*$d,m*$d,x*$d]},n.angle=function(t){return arguments.length?(w=t%360*Wd,e()):w*$d},n.precision=function(t){return arguments.length?(c=ro(s,S=t*t),r()):ap(S)},n.fitExtent=function(t,e){return Ki(n,t,e)},n.fitSize=function(t,e){return to(n,t,e)},n.fitWidth=function(t,e){return no(n,t,e)},n.fitHeight=function(t,e){return eo(n,t,e)},function(){return i=t.apply(this,arguments),n.invert=i.invert&&function(t){return(t=l.invert(t[0],t[1]))&&[t[0]*$d,t[1]*$d]},e()}}function uo(t){var n=0,e=Hd/3,r=ao(t),i=r(n,e);return i.parallels=function(t){return arguments.length?r(n=t[0]*Wd,e=t[1]*Wd):[n*$d,e*$d]},i}function fo(t,n){function e(t,n){var e=ap(o-2*i*ip(n))/i;return[e*ip(t*=i),a-e*Kd(t)]}var r=ip(t),i=(r+ip(n))/2;if(Zd(i)0?n<-Xd+Id&&(n=-Xd+Id):n>Xd-Id&&(n=Xd-Id);var e=o/rp(go(n),i);return[e*ip(i*t),o-e*Kd(i*t)]}var r=Kd(t),i=t===n?ip(t):ep(r/Kd(n))/ep(go(n)/go(t)),o=r*rp(go(t),i)/i;return i?(e.invert=function(t,n){var e=o-n,r=op(i)*ap(t*t+e*e);return[Jd(t,Zd(e))/i*op(e),2*Qd(rp(o/r,1/i))-Xd]},e):po}function _o(t,n){return[t,n]}function bo(t,n){function e(t,n){var e=o-n,r=i*t;return[e*ip(r),o-e*Kd(r)]}var r=Kd(t),i=t===n?ip(t):(r-Kd(n))/(n-t),o=r/i+t;return Zd(i)=0;)n+=e[r].value;else n=1;t.value=n}function Co(t,n){var e,r,i,o,a,u=new Lo(t),f=+t.value&&(u.value=t.value),c=[u];for(null==n&&(n=Po);e=c.pop();)if(f&&(e.value=+e.data.value),(i=n(e.data))&&(a=i.length))for(e.children=new Array(a),o=a-1;o>=0;--o)c.push(r=e.children[o]=new Lo(i[o])),r.parent=e,r.depth=e.depth+1;return u.eachBefore(Ro)}function Po(t){return t.children}function zo(t){t.data=t.data.data}function Ro(t){var n=0;do{t.height=n}while((t=t.parent)&&t.height<++n)}function Lo(t){this.data=t,this.depth=this.height=0,this.parent=null}function Do(t){for(var n,e,r=0,i=(t=function(t){for(var n,e,r=t.length;r;)e=Math.random()*r--|0,n=t[r],t[r]=t[e],t[e]=n;return 
t}(dv.call(t))).length,o=[];r0&&e*e>r*r+i*i}function Oo(t,n){for(var e=0;e(a*=a)?(r=(c+a-i)/(2*c),o=Math.sqrt(Math.max(0,a/c-r*r)),e.x=t.x-r*u-o*f,e.y=t.y-r*f+o*u):(r=(c+i-a)/(2*c),o=Math.sqrt(Math.max(0,i/c-r*r)),e.x=n.x+r*u-o*f,e.y=n.y+r*f+o*u)):(e.x=n.x+e.r,e.y=n.y)}function Io(t,n){var e=t.r+n.r-1e-6,r=n.x-t.x,i=n.y-t.y;return e>0&&e*e>r*r+i*i}function jo(t){var n=t._,e=t.next._,r=n.r+e.r,i=(n.x*e.r+e.x*n.r)/r,o=(n.y*e.r+e.y*n.r)/r;return i*i+o*o}function Ho(t){this._=t,this.next=null,this.previous=null}function Xo(t){if(!(i=t.length))return 0;var n,e,r,i,o,a,u,f,c,s,l;if(n=t[0],n.x=0,n.y=0,!(i>1))return n.r;if(e=t[1],n.x=-e.r,e.x=n.r,e.y=0,!(i>2))return n.r+e.r;Fo(e,n,r=t[2]),n=new Ho(n),e=new Ho(e),r=new Ho(r),n.next=r.previous=e,e.next=n.previous=r,r.next=e.previous=n;t:for(u=3;uh&&(h=u),g=s*s*v,(d=Math.max(h/g,g/l))>p){s-=u;break}p=d}y.push(a={value:s,dice:f1&&la(t[e[r-2]],t[e[r-1]],t[i])<=0;)--r;e[r++]=i}return e.slice(0,r)}function pa(){return Math.random()}function va(t){function n(n){var o=n+"",a=e.get(o);if(!a){if(i!==kv)return i;e.set(o,a=r.push(n))}return t[(a-1)%t.length]}var e=le(),r=[],i=kv;return t=null==t?[]:Ev.call(t),n.domain=function(t){if(!arguments.length)return r.slice();r=[],e=le();for(var i,o,a=-1,u=t.length;++a2?wa:xa,o=a=null,r}function r(n){return(o||(o=i(u,f,s?function(t){return function(n,e){var r=t(n=+n,e=+e);return function(t){return t<=n?0:t>=e?1:r(t)}}}(t):t,c)))(+n)}var i,o,a,u=Cv,f=Cv,c=hn,s=!1;return r.invert=function(t){return(a||(a=i(f,u,ma,s?function(t){return function(n,e){var r=t(n=+n,e=+e);return function(t){return t<=0?n:t>=1?e:r(t)}}}(n):n)))(+t)},r.domain=function(t){return arguments.length?(u=Sv.call(t,ba),e()):u.slice()},r.range=function(t){return arguments.length?(f=Ev.call(t),e()):f.slice()},r.rangeRound=function(t){return f=Ev.call(t),c=dn,e()},r.clamp=function(t){return arguments.length?(s=!!t,e()):s},r.interpolate=function(t){return arguments.length?(c=t,e()):c},e()}function Ta(n){var e=n.domain;return 
n.ticks=function(t){var n=e();return l(n[0],n[n.length-1],null==t?10:t)},n.tickFormat=function(n,r){return function(n,e,r){var i,o=n[0],a=n[n.length-1],u=d(o,a,null==e?10:e);switch((r=tr(null==r?",f":r)).type){case"s":var f=Math.max(Math.abs(o),Math.abs(a));return null!=r.precision||isNaN(i=ar(u,f))||(r.precision=i),t.formatPrefix(r,f);case"":case"e":case"g":case"p":case"r":null!=r.precision||isNaN(i=ur(u,Math.max(Math.abs(o),Math.abs(a))))||(r.precision=i-("e"===r.type));break;case"f":case"%":null!=r.precision||isNaN(i=or(u))||(r.precision=i-2*("%"===r.type))}return t.format(r)}(e(),n,r)},n.nice=function(t){null==t&&(t=10);var r,i=e(),o=0,a=i.length-1,u=i[o],f=i[a];return f0?r=h(u=Math.floor(u/r)*r,f=Math.ceil(f/r)*r,t):r<0&&(r=h(u=Math.ceil(u*r)/r,f=Math.floor(f*r)/r,t)),r>0?(i[o]=Math.floor(u/r)*r,i[a]=Math.ceil(f/r)*r,e(i)):r<0&&(i[o]=Math.ceil(u*r)/r,i[a]=Math.floor(f*r)/r,e(i)),n},n}function Na(){var t=Aa(ma,cn);return t.copy=function(){return Ma(t,Na())},Ta(t)}function Sa(){function t(t){return+t}var n=[0,1];return t.invert=t,t.domain=t.range=function(e){return arguments.length?(n=Sv.call(e,ba),t):n.slice()},t.copy=function(){return Sa().domain(n)},Ta(t)}function Ea(t,n){var e,r=0,i=(t=t.slice()).length-1,o=t[r],a=t[i];return a0){for(;df)break;g.push(h)}}else for(;d=1;--s)if(!((h=c*s)f)break;g.push(h)}}else g=l(d,p,Math.min(p-d,v)).map(a);return n?g.reverse():g},e.tickFormat=function(n,r){if(null==r&&(r=10===i?".0e":","),"function"!=typeof r&&(r=t.format(r)),n===1/0)return r;null==n&&(n=10);var u=Math.max(1,i*n/e.ticks().length);return function(t){var n=t/a(Math.round(o(t)));return n*i0?o[n-1]:r[0],n=i?[o[i-1],r]:[o[n-1],o[n]]},t.copy=function(){return Ya().domain([e,r]).range(a)},Ta(t)}function Ba(){function t(t){if(t<=t)return e[Qc(n,t,0,r)]}var n=[.5],e=[0,1],r=1;return t.domain=function(i){return arguments.length?(n=Ev.call(i),r=Math.min(n.length,e.length-1),t):n.slice()},t.range=function(i){return 
arguments.length?(e=Ev.call(i),r=Math.min(n.length,e.length-1),t):e.slice()},t.invertExtent=function(t){var r=e.indexOf(t);return[n[r-1],n[r]]},t.copy=function(){return Ba().domain(n).range(e)},t}function Fa(t,n,e,r){function i(n){return t(n=new Date(+n)),n}return i.floor=i,i.ceil=function(e){return t(e=new Date(e-1)),n(e,1),t(e),e},i.round=function(t){var n=i(t),e=i.ceil(t);return t-n0))return u;do{u.push(a=new Date(+e)),n(e,o),t(e)}while(a=n)for(;t(n),!e(n);)n.setTime(n-1)},function(t,r){if(t>=t)if(r<0)for(;++r<=0;)for(;n(t,-1),!e(t););else for(;--r>=0;)for(;n(t,1),!e(t););})},e&&(i.count=function(n,r){return Pv.setTime(+n),zv.setTime(+r),t(Pv),t(zv),Math.floor(e(Pv,zv))},i.every=function(t){return t=Math.floor(t),isFinite(t)&&t>0?t>1?i.filter(r?function(n){return r(n)%t==0}:function(n){return i.count(0,n)%t==0}):i:null}),i}function Ia(t){return Fa(function(n){n.setDate(n.getDate()-(n.getDay()+7-t)%7),n.setHours(0,0,0,0)},function(t,n){t.setDate(t.getDate()+7*n)},function(t,n){return(n-t-(n.getTimezoneOffset()-t.getTimezoneOffset())*Dv)/Uv})}function ja(t){return Fa(function(n){n.setUTCDate(n.getUTCDate()-(n.getUTCDay()+7-t)%7),n.setUTCHours(0,0,0,0)},function(t,n){t.setUTCDate(t.getUTCDate()+7*n)},function(t,n){return(n-t)/Uv})}function Ha(t){if(0<=t.y&&t.y<100){var n=new Date(-1,t.m,t.d,t.H,t.M,t.S,t.L);return n.setFullYear(t.y),n}return new Date(t.y,t.m,t.d,t.H,t.M,t.S,t.L)}function Xa(t){if(0<=t.y&&t.y<100){var n=new Date(Date.UTC(-1,t.m,t.d,t.H,t.M,t.S,t.L));return n.setUTCFullYear(t.y),n}return new Date(Date.UTC(t.y,t.m,t.d,t.H,t.M,t.S,t.L))}function Ga(t){return{y:t,m:0,d:1,H:0,M:0,S:0,L:0}}function Va(t){function n(t,n){return function(e){var r,i,o,a=[],u=-1,f=0,c=t.length;for(e instanceof Date||(e=new Date(+e));++u53)return null;"w"in a||(a.w=1),"Z"in 
a?(i=(o=(i=Xa(Ga(a.y))).getUTCDay())>4||0===o?gg.ceil(i):gg(i),i=dg.offset(i,7*(a.V-1)),a.y=i.getUTCFullYear(),a.m=i.getUTCMonth(),a.d=i.getUTCDate()+(a.w+6)%7):(i=(o=(i=n(Ga(a.y))).getDay())>4||0===o?Gv.ceil(i):Gv(i),i=jv.offset(i,7*(a.V-1)),a.y=i.getFullYear(),a.m=i.getMonth(),a.d=i.getDate()+(a.w+6)%7)}else("W"in a||"U"in a)&&("w"in a||(a.w="u"in a?a.u%7:"W"in a?1:0),o="Z"in a?Xa(Ga(a.y)).getUTCDay():n(Ga(a.y)).getDay(),a.m=0,a.d="W"in a?(a.w+6)%7+7*a.W-(o+5)%7:a.w+7*a.U-(o+6)%7);return"Z"in a?(a.H+=a.Z/100|0,a.M+=a.Z%100,Xa(a)):n(a)}}function r(t,n,e,r){for(var i,o,a=0,u=n.length,f=e.length;a=f)return-1;if(37===(i=n.charCodeAt(a++))){if(i=n.charAt(a++),!(o=A[i in Lg?n.charAt(a++):i])||(r=o(t,e,r))<0)return-1}else if(i!=e.charCodeAt(r++))return-1}return r}var i=t.dateTime,o=t.date,a=t.time,u=t.periods,f=t.days,c=t.shortDays,s=t.months,l=t.shortMonths,h=Za(u),d=Qa(u),p=Za(f),v=Qa(f),g=Za(c),y=Qa(c),_=Za(s),b=Qa(s),m=Za(l),x=Qa(l),w={a:function(t){return c[t.getDay()]},A:function(t){return f[t.getDay()]},b:function(t){return l[t.getMonth()]},B:function(t){return s[t.getMonth()]},c:null,d:yu,e:yu,f:wu,H:_u,I:bu,j:mu,L:xu,m:Mu,M:Au,p:function(t){return u[+(t.getHours()>=12)]},Q:Ju,s:Ku,S:Tu,u:Nu,U:Su,V:Eu,w:ku,W:Cu,x:null,X:null,y:Pu,Y:zu,Z:Ru,"%":Qu},M={a:function(t){return c[t.getUTCDay()]},A:function(t){return f[t.getUTCDay()]},b:function(t){return l[t.getUTCMonth()]},B:function(t){return s[t.getUTCMonth()]},c:null,d:Lu,e:Lu,f:Yu,H:Du,I:Uu,j:qu,L:Ou,m:Bu,M:Fu,p:function(t){return u[+(t.getUTCHours()>=12)]},Q:Ju,s:Ku,S:Iu,u:ju,U:Hu,V:Xu,w:Gu,W:Vu,x:null,X:null,y:$u,Y:Wu,Z:Zu,"%":Qu},A={a:function(t,n,e){var r=g.exec(n.slice(e));return r?(t.w=y[r[0].toLowerCase()],e+r[0].length):-1},A:function(t,n,e){var r=p.exec(n.slice(e));return r?(t.w=v[r[0].toLowerCase()],e+r[0].length):-1},b:function(t,n,e){var r=m.exec(n.slice(e));return r?(t.m=x[r[0].toLowerCase()],e+r[0].length):-1},B:function(t,n,e){var r=_.exec(n.slice(e));return 
r?(t.m=b[r[0].toLowerCase()],e+r[0].length):-1},c:function(t,n,e){return r(t,i,n,e)},d:uu,e:uu,f:du,H:cu,I:cu,j:fu,L:hu,m:au,M:su,p:function(t,n,e){var r=h.exec(n.slice(e));return r?(t.p=d[r[0].toLowerCase()],e+r[0].length):-1},Q:vu,s:gu,S:lu,u:Ka,U:tu,V:nu,w:Ja,W:eu,x:function(t,n,e){return r(t,o,n,e)},X:function(t,n,e){return r(t,a,n,e)},y:iu,Y:ru,Z:ou,"%":pu};return w.x=n(o,w),w.X=n(a,w),w.c=n(i,w),M.x=n(o,M),M.X=n(a,M),M.c=n(i,M),{format:function(t){var e=n(t+="",w);return e.toString=function(){return t},e},parse:function(t){var n=e(t+="",Ha);return n.toString=function(){return t},n},utcFormat:function(t){var e=n(t+="",M);return e.toString=function(){return t},e},utcParse:function(t){var n=e(t,Xa);return n.toString=function(){return t},n}}}function $a(t,n,e){var r=t<0?"-":"",i=(r?-t:t)+"",o=i.length;return r+(o68?1900:2e3),e+r[0].length):-1}function ou(t,n,e){var r=/^(Z)|([+-]\d\d)(?::?(\d\d))?/.exec(n.slice(e,e+6));return r?(t.Z=r[1]?0:-(r[2]+(r[3]||"00")),e+r[0].length):-1}function au(t,n,e){var r=Dg.exec(n.slice(e,e+2));return r?(t.m=r[0]-1,e+r[0].length):-1}function uu(t,n,e){var r=Dg.exec(n.slice(e,e+2));return r?(t.d=+r[0],e+r[0].length):-1}function fu(t,n,e){var r=Dg.exec(n.slice(e,e+3));return r?(t.m=0,t.d=+r[0],e+r[0].length):-1}function cu(t,n,e){var r=Dg.exec(n.slice(e,e+2));return r?(t.H=+r[0],e+r[0].length):-1}function su(t,n,e){var r=Dg.exec(n.slice(e,e+2));return r?(t.M=+r[0],e+r[0].length):-1}function lu(t,n,e){var r=Dg.exec(n.slice(e,e+2));return r?(t.S=+r[0],e+r[0].length):-1}function hu(t,n,e){var r=Dg.exec(n.slice(e,e+3));return r?(t.L=+r[0],e+r[0].length):-1}function du(t,n,e){var r=Dg.exec(n.slice(e,e+6));return r?(t.L=Math.floor(r[0]/1e3),e+r[0].length):-1}function pu(t,n,e){var r=Ug.exec(n.slice(e,e+1));return r?e+r[0].length:-1}function vu(t,n,e){var r=Dg.exec(n.slice(e));return r?(t.Q=+r[0],e+r[0].length):-1}function gu(t,n,e){var r=Dg.exec(n.slice(e));return r?(t.Q=1e3*+r[0],e+r[0].length):-1}function yu(t,n){return 
$a(t.getDate(),n,2)}function _u(t,n){return $a(t.getHours(),n,2)}function bu(t,n){return $a(t.getHours()%12||12,n,2)}function mu(t,n){return $a(1+jv.count(ug(t),t),n,3)}function xu(t,n){return $a(t.getMilliseconds(),n,3)}function wu(t,n){return xu(t,n)+"000"}function Mu(t,n){return $a(t.getMonth()+1,n,2)}function Au(t,n){return $a(t.getMinutes(),n,2)}function Tu(t,n){return $a(t.getSeconds(),n,2)}function Nu(t){var n=t.getDay();return 0===n?7:n}function Su(t,n){return $a(Xv.count(ug(t),t),n,2)}function Eu(t,n){var e=t.getDay();return t=e>=4||0===e?Wv(t):Wv.ceil(t),$a(Wv.count(ug(t),t)+(4===ug(t).getDay()),n,2)}function ku(t){return t.getDay()}function Cu(t,n){return $a(Gv.count(ug(t),t),n,2)}function Pu(t,n){return $a(t.getFullYear()%100,n,2)}function zu(t,n){return $a(t.getFullYear()%1e4,n,4)}function Ru(t){var n=t.getTimezoneOffset();return(n>0?"-":(n*=-1,"+"))+$a(n/60|0,"0",2)+$a(n%60,"0",2)}function Lu(t,n){return $a(t.getUTCDate(),n,2)}function Du(t,n){return $a(t.getUTCHours(),n,2)}function Uu(t,n){return $a(t.getUTCHours()%12||12,n,2)}function qu(t,n){return $a(1+dg.count(Pg(t),t),n,3)}function Ou(t,n){return $a(t.getUTCMilliseconds(),n,3)}function Yu(t,n){return Ou(t,n)+"000"}function Bu(t,n){return $a(t.getUTCMonth()+1,n,2)}function Fu(t,n){return $a(t.getUTCMinutes(),n,2)}function Iu(t,n){return $a(t.getUTCSeconds(),n,2)}function ju(t){var n=t.getUTCDay();return 0===n?7:n}function Hu(t,n){return $a(vg.count(Pg(t),t),n,2)}function Xu(t,n){var e=t.getUTCDay();return t=e>=4||0===e?bg(t):bg.ceil(t),$a(bg.count(Pg(t),t)+(4===Pg(t).getUTCDay()),n,2)}function Gu(t){return t.getUTCDay()}function Vu(t,n){return $a(gg.count(Pg(t),t),n,2)}function $u(t,n){return $a(t.getUTCFullYear()%100,n,2)}function Wu(t,n){return $a(t.getUTCFullYear()%1e4,n,4)}function Zu(){return"+0000"}function Qu(){return"%"}function Ju(t){return+t}function Ku(t){return Math.floor(+t/1e3)}function tf(n){return 
zg=Va(n),t.timeFormat=zg.format,t.timeParse=zg.parse,t.utcFormat=zg.utcFormat,t.utcParse=zg.utcParse,zg}function nf(t){return new Date(t)}function ef(t){return t instanceof Date?+t:+new Date(+t)}function rf(t,n,r,i,o,a,u,f,c){function s(e){return(u(e)=1?m_:t<=-1?-m_:Math.asin(t)}function lf(t){return t.innerRadius}function hf(t){return t.outerRadius}function df(t){return t.startAngle}function pf(t){return t.endAngle}function vf(t){return t&&t.padAngle}function gf(t,n,e,r,i,o,a){var u=t-e,f=n-r,c=(a?o:-o)/y_(u*u+f*f),s=c*f,l=-c*u,h=t+s,d=n+l,p=e+s,v=r+l,g=(h+p)/2,y=(d+v)/2,_=p-h,b=v-d,m=_*_+b*b,x=i-o,w=h*v-p*d,M=(b<0?-1:1)*y_(p_(0,x*x*m-w*w)),A=(w*b-_*M)/m,T=(-w*_-b*M)/m,N=(w*b+_*M)/m,S=(-w*_+b*M)/m,E=A-g,k=T-y,C=N-g,P=S-y;return E*E+k*k>C*C+P*P&&(A=N,T=S),{cx:A,cy:T,x01:-s,y01:-l,x11:A*(i/x-1),y11:T*(i/x-1)}}function yf(t){this._context=t}function _f(t){return new yf(t)}function bf(t){return t[0]}function mf(t){return t[1]}function xf(){function t(t){var u,f,c,s=t.length,l=!1;for(null==i&&(a=o(c=ie())),u=0;u<=s;++u)!(u=s;--l)c.point(g[l],y[l]);c.lineEnd(),c.areaEnd()}v&&(g[n]=+e(h,n,t),y[n]=+i(h,n,t),c.point(r?+r(h,n,t):g[n],o?+o(h,n,t):y[n]))}if(d)return c=null,d+""||null}function n(){return xf().defined(a).curve(f).context(u)}var e=bf,r=null,i=cf(0),o=mf,a=cf(!0),u=null,f=_f,c=null;return t.x=function(n){return arguments.length?(e="function"==typeof n?n:cf(+n),r=null,t):e},t.x0=function(n){return arguments.length?(e="function"==typeof n?n:cf(+n),t):e},t.x1=function(n){return arguments.length?(r=null==n?null:"function"==typeof n?n:cf(+n),t):r},t.y=function(n){return arguments.length?(i="function"==typeof n?n:cf(+n),o=null,t):i},t.y0=function(n){return arguments.length?(i="function"==typeof n?n:cf(+n),t):i},t.y1=function(n){return arguments.length?(o=null==n?null:"function"==typeof n?n:cf(+n),t):o},t.lineX0=t.lineY0=function(){return n().x(e).y(i)},t.lineY1=function(){return n().x(e).y(o)},t.lineX1=function(){return n().x(r).y(i)},t.defined=function(n){return 
arguments.length?(a="function"==typeof n?n:cf(!!n),t):a},t.curve=function(n){return arguments.length?(f=n,null!=u&&(c=f(u)),t):f},t.context=function(n){return arguments.length?(null==n?u=c=null:c=f(u=n),t):u},t}function Mf(t,n){return nt?1:n>=t?0:NaN}function Af(t){return t}function Tf(t){this._curve=t}function Nf(t){function n(n){return new Tf(t(n))}return n._curve=t,n}function Sf(t){var n=t.curve;return t.angle=t.x,delete t.x,t.radius=t.y,delete t.y,t.curve=function(t){return arguments.length?n(Nf(t)):n()._curve},t}function Ef(){return Sf(xf().curve(w_))}function kf(){var t=wf().curve(w_),n=t.curve,e=t.lineX0,r=t.lineX1,i=t.lineY0,o=t.lineY1;return t.angle=t.x,delete t.x,t.startAngle=t.x0,delete t.x0,t.endAngle=t.x1,delete t.x1,t.radius=t.y,delete t.y,t.innerRadius=t.y0,delete t.y0,t.outerRadius=t.y1,delete t.y1,t.lineStartAngle=function(){return Sf(e())},delete t.lineX0,t.lineEndAngle=function(){return Sf(r())},delete t.lineX1,t.lineInnerRadius=function(){return Sf(i())},delete t.lineY0,t.lineOuterRadius=function(){return Sf(o())},delete t.lineY1,t.curve=function(t){return arguments.length?n(Nf(t)):n()._curve},t}function Cf(t,n){return[(n=+n)*Math.cos(t-=Math.PI/2),n*Math.sin(t)]}function Pf(t){return t.source}function zf(t){return t.target}function Rf(t){function n(){var n,u=M_.call(arguments),f=e.apply(this,u),c=r.apply(this,u);if(a||(a=n=ie()),t(a,+i.apply(this,(u[0]=f,u)),+o.apply(this,u),+i.apply(this,(u[0]=c,u)),+o.apply(this,u)),n)return a=null,n+""||null}var e=Pf,r=zf,i=bf,o=mf,a=null;return n.source=function(t){return arguments.length?(e=t,n):e},n.target=function(t){return arguments.length?(r=t,n):r},n.x=function(t){return arguments.length?(i="function"==typeof t?t:cf(+t),n):i},n.y=function(t){return arguments.length?(o="function"==typeof t?t:cf(+t),n):o},n.context=function(t){return arguments.length?(a=null==t?null:t,n):a},n}function Lf(t,n,e,r,i){t.moveTo(n,e),t.bezierCurveTo(n=(n+r)/2,e,n,i,r,i)}function 
Df(t,n,e,r,i){t.moveTo(n,e),t.bezierCurveTo(n,e=(e+i)/2,r,e,r,i)}function Uf(t,n,e,r,i){var o=Cf(n,e),a=Cf(n,e=(e+i)/2),u=Cf(r,e),f=Cf(r,i);t.moveTo(o[0],o[1]),t.bezierCurveTo(a[0],a[1],u[0],u[1],f[0],f[1])}function qf(){}function Of(t,n,e){t._context.bezierCurveTo((2*t._x0+t._x1)/3,(2*t._y0+t._y1)/3,(t._x0+2*t._x1)/3,(t._y0+2*t._y1)/3,(t._x0+4*t._x1+n)/6,(t._y0+4*t._y1+e)/6)}function Yf(t){this._context=t}function Bf(t){this._context=t}function Ff(t){this._context=t}function If(t,n){this._basis=new Yf(t),this._beta=n}function jf(t,n,e){t._context.bezierCurveTo(t._x1+t._k*(t._x2-t._x0),t._y1+t._k*(t._y2-t._y0),t._x2+t._k*(t._x1-n),t._y2+t._k*(t._y1-e),t._x2,t._y2)}function Hf(t,n){this._context=t,this._k=(1-n)/6}function Xf(t,n){this._context=t,this._k=(1-n)/6}function Gf(t,n){this._context=t,this._k=(1-n)/6}function Vf(t,n,e){var r=t._x1,i=t._y1,o=t._x2,a=t._y2;if(t._l01_a>__){var u=2*t._l01_2a+3*t._l01_a*t._l12_a+t._l12_2a,f=3*t._l01_a*(t._l01_a+t._l12_a);r=(r*u-t._x0*t._l12_2a+t._x2*t._l01_2a)/f,i=(i*u-t._y0*t._l12_2a+t._y2*t._l01_2a)/f}if(t._l23_a>__){var c=2*t._l23_2a+3*t._l23_a*t._l12_a+t._l12_2a,s=3*t._l23_a*(t._l23_a+t._l12_a);o=(o*c+t._x1*t._l23_2a-n*t._l12_2a)/s,a=(a*c+t._y1*t._l23_2a-e*t._l12_2a)/s}t._context.bezierCurveTo(r,i,o,a,t._x2,t._y2)}function $f(t,n){this._context=t,this._alpha=n}function Wf(t,n){this._context=t,this._alpha=n}function Zf(t,n){this._context=t,this._alpha=n}function Qf(t){this._context=t}function Jf(t){return t<0?-1:1}function Kf(t,n,e){var r=t._x1-t._x0,i=n-t._x1,o=(t._y1-t._y0)/(r||i<0&&-0),a=(e-t._y1)/(i||r<0&&-0),u=(o*i+a*r)/(r+i);return(Jf(o)+Jf(a))*Math.min(Math.abs(o),Math.abs(a),.5*Math.abs(u))||0}function tc(t,n){var e=t._x1-t._x0;return e?(3*(t._y1-t._y0)/e-n)/2:n}function nc(t,n,e){var r=t._x0,i=t._y0,o=t._x1,a=t._y1,u=(o-r)/3;t._context.bezierCurveTo(r+u,i+u*n,o-u,a-u*e,o,a)}function ec(t){this._context=t}function rc(t){this._context=new ic(t)}function ic(t){this._context=t}function oc(t){this._context=t}function 
ac(t){var n,e,r=t.length-1,i=new Array(r),o=new Array(r),a=new Array(r);for(i[0]=0,o[0]=2,a[0]=t[0]+2*t[1],n=1;n=0;--n)i[n]=(a[n]-i[n+1])/o[n];for(o[r-1]=(t[r]+i[r-1])/2,n=0;n1)for(var e,r,i,o=1,a=t[n[0]],u=a.length;o=0;)e[n]=n;return e}function sc(t,n){return t[n]}function lc(t){var n=t.map(hc);return cc(t).sort(function(t,e){return n[t]-n[e]})}function hc(t){for(var n,e=0,r=-1,i=t.length;++r0)){if(o/=h,h<0){if(o0){if(o>l)return;o>s&&(s=o)}if(o=r-f,h||!(o<0)){if(o/=h,h<0){if(o>l)return;o>s&&(s=o)}else if(h>0){if(o0)){if(o/=d,d<0){if(o0){if(o>l)return;o>s&&(s=o)}if(o=i-c,d||!(o<0)){if(o/=d,d<0){if(o>l)return;o>s&&(s=o)}else if(d>0){if(o0||l<1)||(s>0&&(t[0]=[f+s*h,c+s*d]),l<1&&(t[1]=[f+l*h,c+l*d]),!0)}}}}}function Tc(t,n,e,r,i){var o=t[1];if(o)return!0;var a,u,f=t[0],c=t.left,s=t.right,l=c[0],h=c[1],d=s[0],p=s[1],v=(l+d)/2,g=(h+p)/2;if(p===h){if(v=r)return;if(l>d){if(f){if(f[1]>=i)return}else f=[v,e];o=[v,i]}else{if(f){if(f[1]1)if(l>d){if(f){if(f[1]>=i)return}else f=[(e-u)/a,e];o=[(i-u)/a,i]}else{if(f){if(f[1]=r)return}else f=[n,a*n+u];o=[r,a*r+u]}else{if(f){if(f[0]=-eb)){var d=f*f+c*c,p=s*s+l*l,v=(l*d-c*p)/h,g=(f*p-s*d)/h,y=K_.pop()||new function(){yc(this),this.x=this.y=this.arc=this.site=this.cy=null};y.arc=t,y.site=i,y.x=v+a,y.y=(y.cy=g+u)+Math.sqrt(v*v+g*g),t.circle=y;for(var _=null,b=Q_._;b;)if(y.ynb)u=u.L;else{if(!((i=o-function(t,n){var e=t.N;if(e)return Dc(e,n);var r=t.site;return r[1]===n?r[0]:1/0}(u,a))>nb)){r>-nb?(n=u.P,e=u):i>-nb?(n=u,e=u.N):n=e=u;break}if(!u.R){n=u;break}u=u.R}(function(t){Z_[t.index]={site:t,halfedges:[]}})(t);var f=Pc(t);if(W_.insert(n,f),n||e){if(n===e)return Cc(n),e=Pc(n.site),W_.insert(f,e),f.edge=e.edge=xc(n.site,f.site),kc(n),void kc(e);if(e){Cc(n),Cc(e);var c=n.site,s=c[0],l=c[1],h=t[0]-s,d=t[1]-l,p=e.site,v=p[0]-s,g=p[1]-l,y=2*(h*g-d*v),_=h*h+d*d,b=v*v+g*g,m=[(g*_-d*b)/y+s,(h*b-v*_)/y+l];Mc(e.edge,c,p,m),f.edge=xc(c,t,null,m),e.edge=xc(t,p,null,m),kc(n),kc(e)}else f.edge=xc(n.site,f.site)}}function Dc(t,n){var 
e=t.site,r=e[0],i=e[1],o=i-n;if(!o)return r;var a=t.P;if(!a)return-1/0;var u=(e=a.site)[0],f=e[1],c=f-n;if(!c)return u;var s=u-r,l=1/o-1/c,h=s/c;return l?(-h+Math.sqrt(h*h-2*l*(s*s/(-2*c)-f+c/2+i-o/2)))/l+r:(r+u)/2}function Uc(t,n,e){return(t[0]-e[0])*(n[1]-t[1])-(t[0]-n[0])*(e[1]-t[1])}function qc(t,n){return n[1]-t[1]||n[0]-t[0]}function Oc(t,n){var e,r,i,o=t.sort(qc).pop();for(J_=[],Z_=new Array(t.length),W_=new gc,Q_=new gc;;)if(i=$_,o&&(!i||o[1]nb||Math.abs(i[0][1]-i[1][1])>nb)||delete J_[o]})(a,u,f,c),function(t,n,e,r){var i,o,a,u,f,c,s,l,h,d,p,v,g=Z_.length,y=!0;for(i=0;inb||Math.abs(v-h)>nb)&&(f.splice(u,0,J_.push(wc(a,d,Math.abs(p-t)nb?[t,Math.abs(l-t)nb?[Math.abs(h-r)nb?[e,Math.abs(l-e)nb?[Math.abs(h-n)r?(r+i)/2:Math.min(0,r)||Math.max(0,i),a>o?(o+a)/2:Math.min(0,o)||Math.max(0,a))}var Zc=e(n),Qc=Zc.right,Jc=Zc.left,Kc=Array.prototype,ts=Kc.slice,ns=Kc.map,es=Math.sqrt(50),rs=Math.sqrt(10),is=Math.sqrt(2),os=Array.prototype.slice,as=1,us=2,fs=3,cs=4,ss=1e-6,ls={value:function(){}};S.prototype=N.prototype={constructor:S,on:function(t,n){var e,r=this._,i=function(t,n){return t.trim().split(/^|\s+/).map(function(t){var e="",r=t.indexOf(".");if(r>=0&&(e=t.slice(r+1),t=t.slice(0,r)),t&&!n.hasOwnProperty(t))throw new Error("unknown type: "+t);return{type:t,name:e}})}(t+"",r),o=-1,a=i.length;{if(!(arguments.length<2)){if(null!=n&&"function"!=typeof n)throw new Error("invalid callback: "+n);for(;++o0)for(var e,r,i=new Array(e),o=0;o=0&&(this._names.splice(n,1),this._node.setAttribute("class",this._names.join(" ")))},contains:function(t){return this._names.indexOf(t)>=0}};var bs={};if(t.event=null,"undefined"!=typeof document){"onmouseenter"in document.documentElement||(bs={mouseenter:"mouseover",mouseleave:"mouseout"})}var ms=[null];ut.prototype=ft.prototype={constructor:ut,select:function(t){"function"!=typeof t&&(t=z(t));for(var n=this._groups,e=n.length,r=new 
Array(e),i=0;i=m&&(m=b+1);!(_=g[m])&&++m=0;)(r=i[o])&&(a&&a!==r.nextSibling&&a.parentNode.insertBefore(r,a),a=r);return this},sort:function(t){function n(n,e){return n&&e?t(n.__data__,e.__data__):!n-!e}t||(t=Y);for(var e=this._groups,r=e.length,i=new Array(r),o=0;o1?this.each((null==n?function(t){return function(){this.style.removeProperty(t)}}:"function"==typeof n?function(t,n,e){return function(){var r=n.apply(this,arguments);null==r?this.style.removeProperty(t):this.style.setProperty(t,r,e)}}:function(t,n,e){return function(){this.style.setProperty(t,n,e)}})(t,n,null==e?"":e)):F(this.node(),t)},property:function(t,n){return arguments.length>1?this.each((null==n?function(t){return function(){delete this[t]}}:"function"==typeof n?function(t,n){return function(){var e=n.apply(this,arguments);null==e?delete this[t]:this[t]=e}}:function(t,n){return function(){this[t]=n}})(t,n)):this.node()[t]},classed:function(t,n){var e=I(t+"");if(arguments.length<2){for(var r=j(this.node()),i=-1,o=e.length;++i=0&&(n=t.slice(e+1),t=t.slice(0,e)),{type:t,name:n}})}(t+""),a=o.length;if(!(arguments.length<2)){for(u=n?it:rt,null==e&&(e=!1),r=0;r=240?t-240:t+120,i,r),Ot(t,i,r),Ot(t<120?t+240:t-120,i,r),this.opacity)},displayable:function(){return(0<=this.s&&this.s<=1||isNaN(this.s))&&0<=this.l&&this.l<=1&&0<=this.opacity&&this.opacity<=1}}));var Ls=Math.PI/180,Ds=180/Math.PI,Us=.96422,qs=1,Os=.82521,Ys=4/29,Bs=6/29,Fs=3*Bs*Bs,Is=Bs*Bs*Bs;Nt(Ft,Bt,St(Et,{brighter:function(t){return new Ft(this.l+18*(null==t?1:t),this.a,this.b,this.opacity)},darker:function(t){return new Ft(this.l-18*(null==t?1:t),this.a,this.b,this.opacity)},rgb:function(){var t=(this.l+16)/116,n=isNaN(this.a)?t:t+this.a/500,e=isNaN(this.b)?t:t-this.b/200;return n=Us*jt(n),t=qs*jt(t),e=Os*jt(e),new Lt(Ht(3.1338561*n-1.6168667*t-.4906146*e),Ht(-.9787684*n+1.9161415*t+.033454*e),Ht(.0719453*n-.2289914*t+1.4052427*e),this.opacity)}})),Nt($t,Vt,St(Et,{brighter:function(t){return new 
$t(this.h,this.c,this.l+18*(null==t?1:t),this.opacity)},darker:function(t){return new $t(this.h,this.c,this.l-18*(null==t?1:t),this.opacity)},rgb:function(){return Yt(this).rgb()}}));var js=-.29227,Hs=-.90649,Xs=1.97294,Gs=Xs*Hs,Vs=1.78277*Xs,$s=1.78277*js- -.14861*Hs;Nt(Zt,Wt,St(Et,{brighter:function(t){return t=null==t?1/.7:Math.pow(1/.7,t),new Zt(this.h,this.s,this.l*t,this.opacity)},darker:function(t){return t=null==t?.7:Math.pow(.7,t),new Zt(this.h,this.s,this.l*t,this.opacity)},rgb:function(){var t=isNaN(this.h)?0:(this.h+120)*Ls,n=+this.l,e=isNaN(this.s)?0:this.s*n*(1-n),r=Math.cos(t),i=Math.sin(t);return new Lt(255*(n+e*(-.14861*r+1.78277*i)),255*(n+e*(js*r+Hs*i)),255*(n+e*(Xs*r)),this.opacity)}}));var Ws,Zs,Qs,Js,Ks,tl,nl=function t(n){function e(t,n){var e=r((t=Rt(t)).r,(n=Rt(n)).r),i=r(t.g,n.g),o=r(t.b,n.b),a=on(t.opacity,n.opacity);return function(n){return t.r=e(n),t.g=i(n),t.b=o(n),t.opacity=a(n),t+""}}var r=rn(n);return e.gamma=t,e}(1),el=an(Jt),rl=an(Kt),il=/[-+]?(?:\d+\.?\d*|\.?\d+)(?:[eE][-+]?\d+)?/g,ol=new RegExp(il.source,"g"),al=180/Math.PI,ul={translateX:0,translateY:0,rotate:0,skewX:0,scaleX:1,scaleY:1},fl=vn(function(t){return"none"===t?ul:(Ws||(Ws=document.createElement("DIV"),Zs=document.documentElement,Qs=document.defaultView),Ws.style.transform=t,t=Qs.getComputedStyle(Zs.appendChild(Ws),null).getPropertyValue("transform"),Zs.removeChild(Ws),t=t.slice(7,-1).split(","),pn(+t[0],+t[1],+t[2],+t[3],+t[4],+t[5]))},"px, ","px)","deg)"),cl=vn(function(t){return null==t?ul:(Js||(Js=document.createElementNS("http://www.w3.org/2000/svg","g")),Js.setAttribute("transform",t),(t=Js.transform.baseVal.consolidate())?(t=t.matrix,pn(t.a,t.b,t.c,t.d,t.e,t.f)):ul)},", ",")",")"),sl=Math.SQRT2,ll=2,hl=4,dl=1e-12,pl=_n(en),vl=_n(on),gl=bn(en),yl=bn(on),_l=mn(en),bl=mn(on),ml=0,xl=0,wl=0,Ml=1e3,Al=0,Tl=0,Nl=0,Sl="object"==typeof performance&&performance.now?performance:Date,El="object"==typeof 
window&&window.requestAnimationFrame?window.requestAnimationFrame.bind(window):function(t){setTimeout(t,17)};Mn.prototype=An.prototype={constructor:Mn,restart:function(t,n,e){if("function"!=typeof t)throw new TypeError("callback is not a function");e=(null==e?xn():+e)+(null==n?0:+n),this._next||tl===this||(tl?tl._next=this:Ks=this,tl=this),this._call=t,this._time=e,En()},stop:function(){this._call&&(this._call=null,this._time=1/0,En())}};var kl=N("start","end","interrupt"),Cl=[],Pl=0,zl=1,Rl=2,Ll=3,Dl=4,Ul=5,ql=6,Ol=ft.prototype.constructor,Yl=0,Bl=ft.prototype;qn.prototype=On.prototype={constructor:qn,select:function(t){var n=this._name,e=this._id;"function"!=typeof t&&(t=z(t));for(var r=this._groups,i=r.length,o=new Array(i),a=0;a=0&&(t=t.slice(0,n)),!t||"start"===t})}(n)?Pn:zn;return function(){var a=o(this,t),u=a.on;u!==r&&(i=(r=u).copy()).on(n,e),a.on=i}}(e,t,n))},attr:function(t,n){var e=k(t),r="transform"===e?cl:Un;return this.attrTween(t,"function"==typeof n?(e.local?function(t,n,e){var r,i,o;return function(){var a,u=e(this);if(null!=u)return(a=this.getAttributeNS(t.space,t.local))===u?null:a===r&&u===i?o:o=n(r=a,i=u);this.removeAttributeNS(t.space,t.local)}}:function(t,n,e){var r,i,o;return function(){var a,u=e(this);if(null!=u)return(a=this.getAttribute(t))===u?null:a===r&&u===i?o:o=n(r=a,i=u);this.removeAttribute(t)}})(e,r,Dn(this,"attr."+t,n)):null==n?(e.local?function(t){return function(){this.removeAttributeNS(t.space,t.local)}}:function(t){return function(){this.removeAttribute(t)}})(e):(e.local?function(t,n,e){var r,i;return function(){var o=this.getAttributeNS(t.space,t.local);return o===e?null:o===r?i:i=n(r=o,e)}}:function(t,n,e){var r,i;return function(){var o=this.getAttribute(t);return o===e?null:o===r?i:i=n(r=o,e)}})(e,r,n+""))},attrTween:function(t,n){var e="attr."+t;if(arguments.length<2)return(e=this.tween(e))&&e._value;if(null==n)return this.tween(e,null);if("function"!=typeof n)throw new Error;var r=k(t);return 
this.tween(e,(r.local?function(t,n){function e(){var e=this,r=n.apply(e,arguments);return r&&function(n){e.setAttributeNS(t.space,t.local,r(n))}}return e._value=n,e}:function(t,n){function e(){var e=this,r=n.apply(e,arguments);return r&&function(n){e.setAttribute(t,r(n))}}return e._value=n,e})(r,n))},style:function(t,n,e){var r="transform"==(t+="")?fl:Un;return null==n?this.styleTween(t,function(t,n){var e,r,i;return function(){var o=F(this,t),a=(this.style.removeProperty(t),F(this,t));return o===a?null:o===e&&a===r?i:i=n(e=o,r=a)}}(t,r)).on("end.style."+t,function(t){return function(){this.style.removeProperty(t)}}(t)):this.styleTween(t,"function"==typeof n?function(t,n,e){var r,i,o;return function(){var a=F(this,t),u=e(this);return null==u&&(this.style.removeProperty(t),u=F(this,t)),a===u?null:a===r&&u===i?o:o=n(r=a,i=u)}}(t,r,Dn(this,"style."+t,n)):function(t,n,e){var r,i;return function(){var o=F(this,t);return o===e?null:o===r?i:i=n(r=o,e)}}(t,r,n+""),e)},styleTween:function(t,n,e){var r="style."+(t+="");if(arguments.length<2)return(r=this.tween(r))&&r._value;if(null==n)return this.tween(r,null);if("function"!=typeof n)throw new Error;return this.tween(r,function(t,n,e){function r(){var r=this,i=n.apply(r,arguments);return i&&function(n){r.style.setProperty(t,i(n),e)}}return r._value=n,r}(t,n,null==e?"":e))},text:function(t){return this.tween("text","function"==typeof t?function(t){return function(){var n=t(this);this.textContent=null==n?"":n}}(Dn(this,"text",t)):function(t){return function(){this.textContent=t}}(null==t?"":t+""))},remove:function(){return this.on("end.remove",function(t){return function(){var n=this.parentNode;for(var e in this.__transition)if(+e!==t)return;n&&n.removeChild(this)}}(this._id))},tween:function(t,n){var e=this._id;if(t+="",arguments.length<2){for(var r,i=Rn(this.node(),e).tween,o=0,a=i.length;o1e-6)if(Math.abs(s*u-f*c)>1e-6&&i){var 
h=e-o,d=r-a,p=u*u+f*f,v=h*h+d*d,g=Math.sqrt(p),y=Math.sqrt(l),_=i*Math.tan((Ch-Math.acos((p+l-v)/(2*g*y)))/2),b=_/y,m=_/g;Math.abs(b-1)>1e-6&&(this._+="L"+(t+b*c)+","+(n+b*s)),this._+="A"+i+","+i+",0,0,"+ +(s*h>c*d)+","+(this._x1=t+m*u)+","+(this._y1=n+m*f)}else this._+="L"+(this._x1=t)+","+(this._y1=n);else;},arc:function(t,n,e,r,i,o){t=+t,n=+n;var a=(e=+e)*Math.cos(r),u=e*Math.sin(r),f=t+a,c=n+u,s=1^o,l=o?r-i:i-r;if(e<0)throw new Error("negative radius: "+e);null===this._x1?this._+="M"+f+","+c:(Math.abs(this._x1-f)>1e-6||Math.abs(this._y1-c)>1e-6)&&(this._+="L"+f+","+c),e&&(l<0&&(l=l%Ph+Ph),l>zh?this._+="A"+e+","+e+",0,1,"+s+","+(t-a)+","+(n-u)+"A"+e+","+e+",0,1,"+s+","+(this._x1=f)+","+(this._y1=c):l>1e-6&&(this._+="A"+e+","+e+",0,"+ +(l>=Ch)+","+s+","+(this._x1=t+e*Math.cos(i))+","+(this._y1=n+e*Math.sin(i))))},rect:function(t,n,e,r){this._+="M"+(this._x0=this._x1=+t)+","+(this._y0=this._y1=+n)+"h"+ +e+"v"+ +r+"h"+-e+"Z"},toString:function(){return this._}};se.prototype=le.prototype={constructor:se,has:function(t){return"$"+t in this},get:function(t){return this["$"+t]},set:function(t,n){return this["$"+t]=n,this},remove:function(t){var n="$"+t;return n in this&&delete this[n]},clear:function(){for(var t in this)"$"===t[0]&&delete this[t]},keys:function(){var t=[];for(var n in this)"$"===n[0]&&t.push(n.slice(1));return t},values:function(){var t=[];for(var n in this)"$"===n[0]&&t.push(this[n]);return t},entries:function(){var t=[];for(var n in this)"$"===n[0]&&t.push({key:n.slice(1),value:this[n]});return t},size:function(){var t=0;for(var n in this)"$"===n[0]&&++t;return t},empty:function(){for(var t in this)if("$"===t[0])return!1;return!0},each:function(t){for(var n in this)"$"===n[0]&&t(this[n],n.slice(1),this)}};var Rh=le.prototype;ge.prototype=ye.prototype={constructor:ge,has:Rh.has,add:function(t){return t+="",this["$"+t]=t,this},remove:Rh.remove,clear:Rh.clear,values:Rh.keys,size:Rh.size,empty:Rh.empty,each:Rh.each};var 
Lh=Array.prototype.slice,Dh=[[],[[[1,1.5],[.5,1]]],[[[1.5,1],[1,1.5]]],[[[1.5,1],[.5,1]]],[[[1,.5],[1.5,1]]],[[[1,1.5],[.5,1]],[[1,.5],[1.5,1]]],[[[1,.5],[1,1.5]]],[[[1,.5],[.5,1]]],[[[.5,1],[1,.5]]],[[[1,1.5],[1,.5]]],[[[.5,1],[1,.5]],[[1.5,1],[1,1.5]]],[[[1.5,1],[1,.5]]],[[[.5,1],[1.5,1]]],[[[1,1.5],[1.5,1]]],[[[.5,1],[1,1.5]]],[]],Uh={},qh={},Oh=34,Yh=10,Bh=13,Fh=Ee(","),Ih=Fh.parse,jh=Fh.parseRows,Hh=Fh.format,Xh=Fh.formatRows,Gh=Ee("\t"),Vh=Gh.parse,$h=Gh.parseRows,Wh=Gh.format,Zh=Gh.formatRows,Qh=Re(Ih),Jh=Re(Vh),Kh=De("application/xml"),td=De("text/html"),nd=De("image/svg+xml"),ed=Ie.prototype=je.prototype;ed.copy=function(){var t,n,e=new je(this._x,this._y,this._x0,this._y0,this._x1,this._y1),r=this._root;if(!r)return e;if(!r.length)return e._root=He(r),e;for(t=[{source:r,target:e._root=new Array(4)}];r=t.pop();)for(var i=0;i<4;++i)(n=r.source[i])&&(n.length?t.push({source:n,target:r.target[i]=new Array(4)}):r.target[i]=He(n));return e},ed.add=function(t){var n=+this._x.call(null,t),e=+this._y.call(null,t);return Oe(this.cover(n,e),n,e,t)},ed.addAll=function(t){var n,e,r,i,o=t.length,a=new Array(o),u=new Array(o),f=1/0,c=1/0,s=-1/0,l=-1/0;for(e=0;es&&(s=r),il&&(l=i));for(st||t>i||r>n||n>o))return this;var a,u,f=i-e,c=this._root;switch(u=(n<(r+o)/2)<<1|t<(e+i)/2){case 0:do{a=new Array(4),a[u]=c,c=a}while(f*=2,i=e+f,o=r+f,t>i||n>o);break;case 1:do{a=new Array(4),a[u]=c,c=a}while(f*=2,e=i-f,o=r+f,e>t||n>o);break;case 2:do{a=new Array(4),a[u]=c,c=a}while(f*=2,i=e+f,r=o-f,t>i||r>n);break;case 3:do{a=new Array(4),a[u]=c,c=a}while(f*=2,e=i-f,r=o-f,e>t||r>n)}this._root&&this._root.length&&(this._root=c)}return this._x0=e,this._y0=r,this._x1=i,this._y1=o,this},ed.data=function(){var t=[];return this.visit(function(n){if(!n.length)do{t.push(n.data)}while(n=n.next)}),t},ed.extent=function(t){return arguments.length?this.cover(+t[0][0],+t[0][1]).cover(+t[1][0],+t[1][1]):isNaN(this._x0)?void 0:[[this._x0,this._y0],[this._x1,this._y1]]},ed.find=function(t,n,e){var 
r,i,o,a,u,f,c,s=this._x0,l=this._y0,h=this._x1,d=this._y1,p=[],v=this._root;for(v&&p.push(new Ye(v,s,l,h,d)),null==e?e=1/0:(s=t-e,l=n-e,h=t+e,d=n+e,e*=e);f=p.pop();)if(!(!(v=f.node)||(i=f.x0)>h||(o=f.y0)>d||(a=f.x1)=y)<<1|t>=g)&&(f=p[p.length-1],p[p.length-1]=p[p.length-1-c],p[p.length-1-c]=f)}else{var _=t-+this._x.call(null,v.data),b=n-+this._y.call(null,v.data),m=_*_+b*b;if(m=(u=(p+g)/2))?p=u:g=u,(s=a>=(f=(v+y)/2))?v=f:y=f,n=d,!(d=d[l=s<<1|c]))return this;if(!d.length)break;(n[l+1&3]||n[l+2&3]||n[l+3&3])&&(e=n,h=l)}for(;d.data!==t;)if(r=d,!(d=d.next))return this;return(i=d.next)&&delete d.next,r?(i?r.next=i:delete r.next,this):n?(i?n[l]=i:delete n[l],(d=n[0]||n[1]||n[2]||n[3])&&d===(n[3]||n[2]||n[1]||n[0])&&!d.length&&(e?e[h]=d:this._root=d),this):(this._root=i,this)},ed.removeAll=function(t){for(var n=0,e=t.length;n0&&(o=0)}return o>0?t.slice(0,o)+t.slice(e+1):t},"%":function(t,n){return(100*t).toFixed(n)},b:function(t){return Math.round(t).toString(2)},c:function(t){return t+""},d:function(t){return Math.round(t).toString(10)},e:function(t,n){return t.toExponential(n)},f:function(t,n){return t.toFixed(n)},g:function(t,n){return t.toPrecision(n)},o:function(t){return Math.round(t).toString(8)},p:function(t,n){return Ke(100*t,n)},r:Ke,s:function(t,n){var e=Qe(t,n);if(!e)return t+"";var r=e[0],i=e[1],o=i-(rd=3*Math.max(-8,Math.min(8,Math.floor(i/3))))+1,a=r.length;return o===a?r:o>a?r+new Array(o-a+1).join("0"):o>0?r.slice(0,o)+"."+r.slice(o):"0."+new Array(1-o).join("0")+Qe(t,Math.max(0,n+o-1))[0]},X:function(t){return Math.round(t).toString(16).toUpperCase()},x:function(t){return Math.round(t).toString(16)}},ud=/^(?:(.)?([<>=^]))?([+\-\( ])?([$#])?(0)?(\d+)?(,)?(\.\d+)?([a-z%])?$/i;tr.prototype=nr.prototype,nr.prototype.toString=function(){return this.fill+this.align+this.sign+this.symbol+(this.zero?"0":"")+(null==this.width?"":Math.max(1,0|this.width))+(this.comma?",":"")+(null==this.precision?"":"."+Math.max(0,0|this.precision))+this.type};var 
fd,cd=["y","z","a","f","p","n","µ","m","","k","M","G","T","P","E","Z","Y"];ir({decimal:".",thousands:",",grouping:[3],currency:["$",""]}),cr.prototype={constructor:cr,reset:function(){this.s=this.t=0},add:function(t){sr(Fd,t,this.t),sr(this,Fd.s,this.s),this.s?this.t+=Fd.t:this.s=Fd.t},valueOf:function(){return this.s}};var sd,ld,hd,dd,pd,vd,gd,yd,_d,bd,md,xd,wd,Md,Ad,Td,Nd,Sd,Ed,kd,Cd,Pd,zd,Rd,Ld,Dd,Ud,qd,Od,Yd,Bd,Fd=new cr,Id=1e-6,jd=1e-12,Hd=Math.PI,Xd=Hd/2,Gd=Hd/4,Vd=2*Hd,$d=180/Hd,Wd=Hd/180,Zd=Math.abs,Qd=Math.atan,Jd=Math.atan2,Kd=Math.cos,tp=Math.ceil,np=Math.exp,ep=Math.log,rp=Math.pow,ip=Math.sin,op=Math.sign||function(t){return t>0?1:t<0?-1:0},ap=Math.sqrt,up=Math.tan,fp={Feature:function(t,n){vr(t.geometry,n)},FeatureCollection:function(t,n){for(var e=t.features,r=-1,i=e.length;++rId?_d=90:dp<-Id&&(gd=-90),Ad[0]=vd,Ad[1]=yd}},vp={sphere:pr,point:Br,lineStart:Ir,lineEnd:Xr,polygonStart:function(){vp.lineStart=Gr,vp.lineEnd=Vr},polygonEnd:function(){vp.lineStart=Ir,vp.lineEnd=Xr}};Jr.invert=Jr;var gp,yp,_p,bp,mp,xp,wp,Mp,Ap,Tp,Np,Sp=fr(),Ep=hi(function(){return!0},function(t){var n,e=NaN,r=NaN,i=NaN;return{lineStart:function(){t.lineStart(),n=1},point:function(o,a){var u=o>0?Hd:-Hd,f=Zd(o-e);Zd(f-Hd)0?Xd:-Xd),t.point(i,r),t.lineEnd(),t.lineStart(),t.point(u,r),t.point(o,r),n=0):i!==u&&f>=Hd&&(Zd(e-i)Id?Qd((ip(n)*(o=Kd(r))*ip(e)-ip(r)*(i=Kd(n))*ip(t))/(i*o*a)):(n+r)/2}(e,r,o,a),t.point(i,r),t.lineEnd(),t.lineStart(),t.point(u,r),n=0),t.point(e=o,r=a),i=u},lineEnd:function(){t.lineEnd(),e=r=NaN},clean:function(){return 2-n}}},function(t,n,e,r){var i;if(null==t)i=e*Xd,r.point(-Hd,i),r.point(0,i),r.point(Hd,i),r.point(Hd,0),r.point(Hd,-i),r.point(0,-i),r.point(-Hd,-i),r.point(-Hd,0),r.point(-Hd,i);else if(Zd(t[0]-n[0])>Id){var o=t[0]Ip&&(Ip=t),njp&&(jp=n)},lineStart:pr,lineEnd:pr,polygonStart:pr,polygonEnd:pr,result:function(){var t=[[Bp,Fp],[Ip,jp]];return 
Ip=jp=-(Fp=Bp=1/0),t}},Xp=0,Gp=0,Vp=0,$p=0,Wp=0,Zp=0,Qp=0,Jp=0,Kp=0,tv={point:Ui,lineStart:qi,lineEnd:Bi,polygonStart:function(){tv.lineStart=Fi,tv.lineEnd=Ii},polygonEnd:function(){tv.point=Ui,tv.lineStart=qi,tv.lineEnd=Bi},result:function(){var t=Kp?[Qp/Kp,Jp/Kp]:Zp?[$p/Zp,Wp/Zp]:Vp?[Xp/Vp,Gp/Vp]:[NaN,NaN];return Xp=Gp=Vp=$p=Wp=Zp=Qp=Jp=Kp=0,t}};Xi.prototype={_radius:4.5,pointRadius:function(t){return this._radius=t,this},polygonStart:function(){this._line=0},polygonEnd:function(){this._line=NaN},lineStart:function(){this._point=0},lineEnd:function(){0===this._line&&this._context.closePath(),this._point=NaN},point:function(t,n){switch(this._point){case 0:this._context.moveTo(t,n),this._point=1;break;case 1:this._context.lineTo(t,n);break;default:this._context.moveTo(t+this._radius,n),this._context.arc(t,n,this._radius,0,Vd)}},result:pr};var nv,ev,rv,iv,ov,av=fr(),uv={point:pr,lineStart:function(){uv.point=Gi},lineEnd:function(){nv&&Vi(ev,rv),uv.point=pr},polygonStart:function(){nv=!0},polygonEnd:function(){nv=null},result:function(){var t=+av;return av.reset(),t}};$i.prototype={_radius:4.5,_circle:Wi(4.5),pointRadius:function(t){return(t=+t)!==this._radius&&(this._radius=t,this._circle=null),this},polygonStart:function(){this._line=0},polygonEnd:function(){this._line=NaN},lineStart:function(){this._point=0},lineEnd:function(){0===this._line&&this._string.push("Z"),this._point=NaN},point:function(t,n){switch(this._point){case 0:this._string.push("M",t,",",n),this._point=1;break;case 1:this._string.push("L",t,",",n);break;default:null==this._circle&&(this._circle=Wi(this._radius)),this._string.push("M",t,",",n,this._circle)}},result:function(){if(this._string.length){var t=this._string.join("");return this._string=[],t}return 
null}},Qi.prototype={constructor:Qi,point:function(t,n){this.stream.point(t,n)},sphere:function(){this.stream.sphere()},lineStart:function(){this.stream.lineStart()},lineEnd:function(){this.stream.lineEnd()},polygonStart:function(){this.stream.polygonStart()},polygonEnd:function(){this.stream.polygonEnd()}};var fv=16,cv=Kd(30*Wd),sv=Zi({point:function(t,n){this.stream.point(t*Wd,n*Wd)}}),lv=lo(function(t){return ap(2/(1+t))});lv.invert=ho(function(t){return 2*hr(t/2)});var hv=lo(function(t){return(t=lr(t))&&t/ip(t)});hv.invert=ho(function(t){return t}),po.invert=function(t,n){return[t,2*Qd(np(n))-Xd]},_o.invert=_o,mo.invert=ho(Qd),wo.invert=function(t,n){var e,r=n,i=25;do{var o=r*r,a=o*o;r-=e=(r*(1.007226+o*(.015085+a*(.028874*o-.044475-.005916*a)))-n)/(1.007226+o*(.045255+a*(.259866*o-.311325-.005916*11*a)))}while(Zd(e)>Id&&--i>0);return[t/(.8707+(o=r*r)*(o*(o*o*o*(.003971-.001529*o)-.013791)-.131979)),r]},Mo.invert=ho(hr),Ao.invert=ho(function(t){return 2*Qd(t)}),To.invert=function(t,n){return[-n,2*Qd(np(t))-Xd]},Lo.prototype=Co.prototype={constructor:Lo,count:function(){return this.eachAfter(ko)},each:function(t){var n,e,r,i,o=this,a=[o];do{for(n=a.reverse(),a=[];o=n.pop();)if(t(o),e=o.children)for(r=0,i=e.length;r=0;--e)i.push(n[e]);return this},sum:function(t){return this.eachAfter(function(n){for(var e=+t(n.data)||0,r=n.children,i=r&&r.length;--i>=0;)e+=r[i].value;n.value=e})},sort:function(t){return this.eachBefore(function(n){n.children&&n.children.sort(t)})},path:function(t){for(var n=this,e=function(t,n){if(t===n)return t;var e=t.ancestors(),r=n.ancestors(),i=null;for(t=e.pop(),n=r.pop();t===n;)i=t,t=e.pop(),n=r.pop();return i}(n,t),r=[n];n!==e;)n=n.parent,r.push(n);for(var i=r.length;t!==e;)r.splice(i,0,t),t=t.parent;return r},ancestors:function(){for(var t=this,n=[t];t=t.parent;)n.push(t);return n},descendants:function(){var t=[];return this.each(function(n){t.push(n)}),t},leaves:function(){var t=[];return 
this.eachBefore(function(n){n.children||t.push(n)}),t},links:function(){var t=this,n=[];return t.each(function(e){e!==t&&n.push({source:e.parent,target:e})}),n},copy:function(){return Co(this).eachBefore(zo)}};var dv=Array.prototype.slice,pv="$",vv={depth:-1},gv={};fa.prototype=Object.create(Lo.prototype);var yv=(1+Math.sqrt(5))/2,_v=function t(n){function e(t,e,r,i,o){sa(n,t,e,r,i,o)}return e.ratio=function(n){return t((n=+n)>1?n:1)},e}(yv),bv=function t(n){function e(t,e,r,i,o){if((a=t._squarify)&&a.ratio===n)for(var a,u,f,c,s,l=-1,h=a.length,d=t.value;++l1?n:1)},e}(yv),mv=function t(n){function e(t,e){return t=null==t?0:+t,e=null==e?1:+e,1===arguments.length?(e=t,t=0):e-=t,function(){return n()*e+t}}return e.source=t,e}(pa),xv=function t(n){function e(t,e){var r,i;return t=null==t?0:+t,e=null==e?1:+e,function(){var o;if(null!=r)o=r,r=null;else do{r=2*n()-1,o=2*n()-1,i=r*r+o*o}while(!i||i>1);return t+e*o*Math.sqrt(-2*Math.log(i)/i)}}return e.source=t,e}(pa),wv=function t(n){function e(){var t=xv.source(n).apply(this,arguments);return function(){return Math.exp(t())}}return e.source=t,e}(pa),Mv=function t(n){function e(t){return function(){for(var e=0,r=0;r0?t>1?Fa(function(n){n.setTime(Math.floor(n/t)*t)},function(n,e){n.setTime(+n+e*t)},function(n,e){return(e-n)/t}):Rv:null};var Lv=Rv.range,Dv=6e4,Uv=6048e5,qv=Fa(function(t){t.setTime(1e3*Math.floor(t/1e3))},function(t,n){t.setTime(+t+1e3*n)},function(t,n){return(n-t)/1e3},function(t){return t.getUTCSeconds()}),Ov=qv.range,Yv=Fa(function(t){t.setTime(Math.floor(t/Dv)*Dv)},function(t,n){t.setTime(+t+n*Dv)},function(t,n){return(n-t)/Dv},function(t){return t.getMinutes()}),Bv=Yv.range,Fv=Fa(function(t){var n=t.getTimezoneOffset()*Dv%36e5;n<0&&(n+=36e5),t.setTime(36e5*Math.floor((+t-n)/36e5)+n)},function(t,n){t.setTime(+t+36e5*n)},function(t,n){return(n-t)/36e5},function(t){return 
t.getHours()}),Iv=Fv.range,jv=Fa(function(t){t.setHours(0,0,0,0)},function(t,n){t.setDate(t.getDate()+n)},function(t,n){return(n-t-(n.getTimezoneOffset()-t.getTimezoneOffset())*Dv)/864e5},function(t){return t.getDate()-1}),Hv=jv.range,Xv=Ia(0),Gv=Ia(1),Vv=Ia(2),$v=Ia(3),Wv=Ia(4),Zv=Ia(5),Qv=Ia(6),Jv=Xv.range,Kv=Gv.range,tg=Vv.range,ng=$v.range,eg=Wv.range,rg=Zv.range,ig=Qv.range,og=Fa(function(t){t.setDate(1),t.setHours(0,0,0,0)},function(t,n){t.setMonth(t.getMonth()+n)},function(t,n){return n.getMonth()-t.getMonth()+12*(n.getFullYear()-t.getFullYear())},function(t){return t.getMonth()}),ag=og.range,ug=Fa(function(t){t.setMonth(0,1),t.setHours(0,0,0,0)},function(t,n){t.setFullYear(t.getFullYear()+n)},function(t,n){return n.getFullYear()-t.getFullYear()},function(t){return t.getFullYear()});ug.every=function(t){return isFinite(t=Math.floor(t))&&t>0?Fa(function(n){n.setFullYear(Math.floor(n.getFullYear()/t)*t),n.setMonth(0,1),n.setHours(0,0,0,0)},function(n,e){n.setFullYear(n.getFullYear()+e*t)}):null};var fg=ug.range,cg=Fa(function(t){t.setUTCSeconds(0,0)},function(t,n){t.setTime(+t+n*Dv)},function(t,n){return(n-t)/Dv},function(t){return t.getUTCMinutes()}),sg=cg.range,lg=Fa(function(t){t.setUTCMinutes(0,0,0)},function(t,n){t.setTime(+t+36e5*n)},function(t,n){return(n-t)/36e5},function(t){return t.getUTCHours()}),hg=lg.range,dg=Fa(function(t){t.setUTCHours(0,0,0,0)},function(t,n){t.setUTCDate(t.getUTCDate()+n)},function(t,n){return(n-t)/864e5},function(t){return t.getUTCDate()-1}),pg=dg.range,vg=ja(0),gg=ja(1),yg=ja(2),_g=ja(3),bg=ja(4),mg=ja(5),xg=ja(6),wg=vg.range,Mg=gg.range,Ag=yg.range,Tg=_g.range,Ng=bg.range,Sg=mg.range,Eg=xg.range,kg=Fa(function(t){t.setUTCDate(1),t.setUTCHours(0,0,0,0)},function(t,n){t.setUTCMonth(t.getUTCMonth()+n)},function(t,n){return n.getUTCMonth()-t.getUTCMonth()+12*(n.getUTCFullYear()-t.getUTCFullYear())},function(t){return 
t.getUTCMonth()}),Cg=kg.range,Pg=Fa(function(t){t.setUTCMonth(0,1),t.setUTCHours(0,0,0,0)},function(t,n){t.setUTCFullYear(t.getUTCFullYear()+n)},function(t,n){return n.getUTCFullYear()-t.getUTCFullYear()},function(t){return t.getUTCFullYear()});Pg.every=function(t){return isFinite(t=Math.floor(t))&&t>0?Fa(function(n){n.setUTCFullYear(Math.floor(n.getUTCFullYear()/t)*t),n.setUTCMonth(0,1),n.setUTCHours(0,0,0,0)},function(n,e){n.setUTCFullYear(n.getUTCFullYear()+e*t)}):null};var zg,Rg=Pg.range,Lg={"-":"",_:" ",0:"0"},Dg=/^\s*\d+/,Ug=/^%/,qg=/[\\^$*+?|[\]().{}]/g;tf({dateTime:"%x, %X",date:"%-m/%-d/%Y",time:"%-I:%M:%S %p",periods:["AM","PM"],days:["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"],shortDays:["Sun","Mon","Tue","Wed","Thu","Fri","Sat"],months:["January","February","March","April","May","June","July","August","September","October","November","December"],shortMonths:["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]});var Og="%Y-%m-%dT%H:%M:%S.%LZ",Yg=Date.prototype.toISOString?function(t){return t.toISOString()}:t.utcFormat(Og),Bg=+new Date("2000-01-01T00:00:00.000Z")?function(t){var n=new Date(t);return isNaN(n)?null:n}:t.utcParse(Og),Fg=1e3,Ig=60*Fg,jg=60*Ig,Hg=24*jg,Xg=7*Hg,Gg=30*Hg,Vg=365*Hg,$g=af("1f77b4ff7f0e2ca02cd627289467bd8c564be377c27f7f7fbcbd2217becf"),Wg=af("7fc97fbeaed4fdc086ffff99386cb0f0027fbf5b17666666"),Zg=af("1b9e77d95f027570b3e7298a66a61ee6ab02a6761d666666"),Qg=af("a6cee31f78b4b2df8a33a02cfb9a99e31a1cfdbf6fff7f00cab2d66a3d9affff99b15928"),Jg=af("fbb4aeb3cde3ccebc5decbe4fed9a6ffffcce5d8bdfddaecf2f2f2"),Kg=af("b3e2cdfdcdaccbd5e8f4cae4e6f5c9fff2aef1e2cccccccc"),ty=af("e41a1c377eb84daf4a984ea3ff7f00ffff33a65628f781bf999999"),ny=af("66c2a5fc8d628da0cbe78ac3a6d854ffd92fe5c494b3b3b3"),ey=af("8dd3c7ffffb3bebadafb807280b1d3fdb462b3de69fccde5d9d9d9bc80bdccebc5ffed6f"),ry=new 
Array(3).concat("d8b365f5f5f55ab4ac","a6611adfc27d80cdc1018571","a6611adfc27df5f5f580cdc1018571","8c510ad8b365f6e8c3c7eae55ab4ac01665e","8c510ad8b365f6e8c3f5f5f5c7eae55ab4ac01665e","8c510abf812ddfc27df6e8c3c7eae580cdc135978f01665e","8c510abf812ddfc27df6e8c3f5f5f5c7eae580cdc135978f01665e","5430058c510abf812ddfc27df6e8c3c7eae580cdc135978f01665e003c30","5430058c510abf812ddfc27df6e8c3f5f5f5c7eae580cdc135978f01665e003c30").map(af),iy=uf(ry),oy=new Array(3).concat("af8dc3f7f7f77fbf7b","7b3294c2a5cfa6dba0008837","7b3294c2a5cff7f7f7a6dba0008837","762a83af8dc3e7d4e8d9f0d37fbf7b1b7837","762a83af8dc3e7d4e8f7f7f7d9f0d37fbf7b1b7837","762a839970abc2a5cfe7d4e8d9f0d3a6dba05aae611b7837","762a839970abc2a5cfe7d4e8f7f7f7d9f0d3a6dba05aae611b7837","40004b762a839970abc2a5cfe7d4e8d9f0d3a6dba05aae611b783700441b","40004b762a839970abc2a5cfe7d4e8f7f7f7d9f0d3a6dba05aae611b783700441b").map(af),ay=uf(oy),uy=new Array(3).concat("e9a3c9f7f7f7a1d76a","d01c8bf1b6dab8e1864dac26","d01c8bf1b6daf7f7f7b8e1864dac26","c51b7de9a3c9fde0efe6f5d0a1d76a4d9221","c51b7de9a3c9fde0eff7f7f7e6f5d0a1d76a4d9221","c51b7dde77aef1b6dafde0efe6f5d0b8e1867fbc414d9221","c51b7dde77aef1b6dafde0eff7f7f7e6f5d0b8e1867fbc414d9221","8e0152c51b7dde77aef1b6dafde0efe6f5d0b8e1867fbc414d9221276419","8e0152c51b7dde77aef1b6dafde0eff7f7f7e6f5d0b8e1867fbc414d9221276419").map(af),fy=uf(uy),cy=new Array(3).concat("998ec3f7f7f7f1a340","5e3c99b2abd2fdb863e66101","5e3c99b2abd2f7f7f7fdb863e66101","542788998ec3d8daebfee0b6f1a340b35806","542788998ec3d8daebf7f7f7fee0b6f1a340b35806","5427888073acb2abd2d8daebfee0b6fdb863e08214b35806","5427888073acb2abd2d8daebf7f7f7fee0b6fdb863e08214b35806","2d004b5427888073acb2abd2d8daebfee0b6fdb863e08214b358067f3b08","2d004b5427888073acb2abd2d8daebf7f7f7fee0b6fdb863e08214b358067f3b08").map(af),sy=uf(cy),ly=new 
Array(3).concat("ef8a62f7f7f767a9cf","ca0020f4a58292c5de0571b0","ca0020f4a582f7f7f792c5de0571b0","b2182bef8a62fddbc7d1e5f067a9cf2166ac","b2182bef8a62fddbc7f7f7f7d1e5f067a9cf2166ac","b2182bd6604df4a582fddbc7d1e5f092c5de4393c32166ac","b2182bd6604df4a582fddbc7f7f7f7d1e5f092c5de4393c32166ac","67001fb2182bd6604df4a582fddbc7d1e5f092c5de4393c32166ac053061","67001fb2182bd6604df4a582fddbc7f7f7f7d1e5f092c5de4393c32166ac053061").map(af),hy=uf(ly),dy=new Array(3).concat("ef8a62ffffff999999","ca0020f4a582bababa404040","ca0020f4a582ffffffbababa404040","b2182bef8a62fddbc7e0e0e09999994d4d4d","b2182bef8a62fddbc7ffffffe0e0e09999994d4d4d","b2182bd6604df4a582fddbc7e0e0e0bababa8787874d4d4d","b2182bd6604df4a582fddbc7ffffffe0e0e0bababa8787874d4d4d","67001fb2182bd6604df4a582fddbc7e0e0e0bababa8787874d4d4d1a1a1a","67001fb2182bd6604df4a582fddbc7ffffffe0e0e0bababa8787874d4d4d1a1a1a").map(af),py=uf(dy),vy=new Array(3).concat("fc8d59ffffbf91bfdb","d7191cfdae61abd9e92c7bb6","d7191cfdae61ffffbfabd9e92c7bb6","d73027fc8d59fee090e0f3f891bfdb4575b4","d73027fc8d59fee090ffffbfe0f3f891bfdb4575b4","d73027f46d43fdae61fee090e0f3f8abd9e974add14575b4","d73027f46d43fdae61fee090ffffbfe0f3f8abd9e974add14575b4","a50026d73027f46d43fdae61fee090e0f3f8abd9e974add14575b4313695","a50026d73027f46d43fdae61fee090ffffbfe0f3f8abd9e974add14575b4313695").map(af),gy=uf(vy),yy=new Array(3).concat("fc8d59ffffbf91cf60","d7191cfdae61a6d96a1a9641","d7191cfdae61ffffbfa6d96a1a9641","d73027fc8d59fee08bd9ef8b91cf601a9850","d73027fc8d59fee08bffffbfd9ef8b91cf601a9850","d73027f46d43fdae61fee08bd9ef8ba6d96a66bd631a9850","d73027f46d43fdae61fee08bffffbfd9ef8ba6d96a66bd631a9850","a50026d73027f46d43fdae61fee08bd9ef8ba6d96a66bd631a9850006837","a50026d73027f46d43fdae61fee08bffffbfd9ef8ba6d96a66bd631a9850006837").map(af),_y=uf(yy),by=new 
Array(3).concat("fc8d59ffffbf99d594","d7191cfdae61abdda42b83ba","d7191cfdae61ffffbfabdda42b83ba","d53e4ffc8d59fee08be6f59899d5943288bd","d53e4ffc8d59fee08bffffbfe6f59899d5943288bd","d53e4ff46d43fdae61fee08be6f598abdda466c2a53288bd","d53e4ff46d43fdae61fee08bffffbfe6f598abdda466c2a53288bd","9e0142d53e4ff46d43fdae61fee08be6f598abdda466c2a53288bd5e4fa2","9e0142d53e4ff46d43fdae61fee08bffffbfe6f598abdda466c2a53288bd5e4fa2").map(af),my=uf(by),xy=new Array(3).concat("e5f5f999d8c92ca25f","edf8fbb2e2e266c2a4238b45","edf8fbb2e2e266c2a42ca25f006d2c","edf8fbccece699d8c966c2a42ca25f006d2c","edf8fbccece699d8c966c2a441ae76238b45005824","f7fcfde5f5f9ccece699d8c966c2a441ae76238b45005824","f7fcfde5f5f9ccece699d8c966c2a441ae76238b45006d2c00441b").map(af),wy=uf(xy),My=new Array(3).concat("e0ecf49ebcda8856a7","edf8fbb3cde38c96c688419d","edf8fbb3cde38c96c68856a7810f7c","edf8fbbfd3e69ebcda8c96c68856a7810f7c","edf8fbbfd3e69ebcda8c96c68c6bb188419d6e016b","f7fcfde0ecf4bfd3e69ebcda8c96c68c6bb188419d6e016b","f7fcfde0ecf4bfd3e69ebcda8c96c68c6bb188419d810f7c4d004b").map(af),Ay=uf(My),Ty=new Array(3).concat("e0f3dba8ddb543a2ca","f0f9e8bae4bc7bccc42b8cbe","f0f9e8bae4bc7bccc443a2ca0868ac","f0f9e8ccebc5a8ddb57bccc443a2ca0868ac","f0f9e8ccebc5a8ddb57bccc44eb3d32b8cbe08589e","f7fcf0e0f3dbccebc5a8ddb57bccc44eb3d32b8cbe08589e","f7fcf0e0f3dbccebc5a8ddb57bccc44eb3d32b8cbe0868ac084081").map(af),Ny=uf(Ty),Sy=new Array(3).concat("fee8c8fdbb84e34a33","fef0d9fdcc8afc8d59d7301f","fef0d9fdcc8afc8d59e34a33b30000","fef0d9fdd49efdbb84fc8d59e34a33b30000","fef0d9fdd49efdbb84fc8d59ef6548d7301f990000","fff7ecfee8c8fdd49efdbb84fc8d59ef6548d7301f990000","fff7ecfee8c8fdd49efdbb84fc8d59ef6548d7301fb300007f0000").map(af),Ey=uf(Sy),ky=new 
Array(3).concat("ece2f0a6bddb1c9099","f6eff7bdc9e167a9cf02818a","f6eff7bdc9e167a9cf1c9099016c59","f6eff7d0d1e6a6bddb67a9cf1c9099016c59","f6eff7d0d1e6a6bddb67a9cf3690c002818a016450","fff7fbece2f0d0d1e6a6bddb67a9cf3690c002818a016450","fff7fbece2f0d0d1e6a6bddb67a9cf3690c002818a016c59014636").map(af),Cy=uf(ky),Py=new Array(3).concat("ece7f2a6bddb2b8cbe","f1eef6bdc9e174a9cf0570b0","f1eef6bdc9e174a9cf2b8cbe045a8d","f1eef6d0d1e6a6bddb74a9cf2b8cbe045a8d","f1eef6d0d1e6a6bddb74a9cf3690c00570b0034e7b","fff7fbece7f2d0d1e6a6bddb74a9cf3690c00570b0034e7b","fff7fbece7f2d0d1e6a6bddb74a9cf3690c00570b0045a8d023858").map(af),zy=uf(Py),Ry=new Array(3).concat("e7e1efc994c7dd1c77","f1eef6d7b5d8df65b0ce1256","f1eef6d7b5d8df65b0dd1c77980043","f1eef6d4b9dac994c7df65b0dd1c77980043","f1eef6d4b9dac994c7df65b0e7298ace125691003f","f7f4f9e7e1efd4b9dac994c7df65b0e7298ace125691003f","f7f4f9e7e1efd4b9dac994c7df65b0e7298ace125698004367001f").map(af),Ly=uf(Ry),Dy=new Array(3).concat("fde0ddfa9fb5c51b8a","feebe2fbb4b9f768a1ae017e","feebe2fbb4b9f768a1c51b8a7a0177","feebe2fcc5c0fa9fb5f768a1c51b8a7a0177","feebe2fcc5c0fa9fb5f768a1dd3497ae017e7a0177","fff7f3fde0ddfcc5c0fa9fb5f768a1dd3497ae017e7a0177","fff7f3fde0ddfcc5c0fa9fb5f768a1dd3497ae017e7a017749006a").map(af),Uy=uf(Dy),qy=new Array(3).concat("edf8b17fcdbb2c7fb8","ffffcca1dab441b6c4225ea8","ffffcca1dab441b6c42c7fb8253494","ffffccc7e9b47fcdbb41b6c42c7fb8253494","ffffccc7e9b47fcdbb41b6c41d91c0225ea80c2c84","ffffd9edf8b1c7e9b47fcdbb41b6c41d91c0225ea80c2c84","ffffd9edf8b1c7e9b47fcdbb41b6c41d91c0225ea8253494081d58").map(af),Oy=uf(qy),Yy=new Array(3).concat("f7fcb9addd8e31a354","ffffccc2e69978c679238443","ffffccc2e69978c67931a354006837","ffffccd9f0a3addd8e78c67931a354006837","ffffccd9f0a3addd8e78c67941ab5d238443005a32","ffffe5f7fcb9d9f0a3addd8e78c67941ab5d238443005a32","ffffe5f7fcb9d9f0a3addd8e78c67941ab5d238443006837004529").map(af),By=uf(Yy),Fy=new 
Array(3).concat("fff7bcfec44fd95f0e","ffffd4fed98efe9929cc4c02","ffffd4fed98efe9929d95f0e993404","ffffd4fee391fec44ffe9929d95f0e993404","ffffd4fee391fec44ffe9929ec7014cc4c028c2d04","ffffe5fff7bcfee391fec44ffe9929ec7014cc4c028c2d04","ffffe5fff7bcfee391fec44ffe9929ec7014cc4c02993404662506").map(af),Iy=uf(Fy),jy=new Array(3).concat("ffeda0feb24cf03b20","ffffb2fecc5cfd8d3ce31a1c","ffffb2fecc5cfd8d3cf03b20bd0026","ffffb2fed976feb24cfd8d3cf03b20bd0026","ffffb2fed976feb24cfd8d3cfc4e2ae31a1cb10026","ffffccffeda0fed976feb24cfd8d3cfc4e2ae31a1cb10026","ffffccffeda0fed976feb24cfd8d3cfc4e2ae31a1cbd0026800026").map(af),Hy=uf(jy),Xy=new Array(3).concat("deebf79ecae13182bd","eff3ffbdd7e76baed62171b5","eff3ffbdd7e76baed63182bd08519c","eff3ffc6dbef9ecae16baed63182bd08519c","eff3ffc6dbef9ecae16baed64292c62171b5084594","f7fbffdeebf7c6dbef9ecae16baed64292c62171b5084594","f7fbffdeebf7c6dbef9ecae16baed64292c62171b508519c08306b").map(af),Gy=uf(Xy),Vy=new Array(3).concat("e5f5e0a1d99b31a354","edf8e9bae4b374c476238b45","edf8e9bae4b374c47631a354006d2c","edf8e9c7e9c0a1d99b74c47631a354006d2c","edf8e9c7e9c0a1d99b74c47641ab5d238b45005a32","f7fcf5e5f5e0c7e9c0a1d99b74c47641ab5d238b45005a32","f7fcf5e5f5e0c7e9c0a1d99b74c47641ab5d238b45006d2c00441b").map(af),$y=uf(Vy),Wy=new Array(3).concat("f0f0f0bdbdbd636363","f7f7f7cccccc969696525252","f7f7f7cccccc969696636363252525","f7f7f7d9d9d9bdbdbd969696636363252525","f7f7f7d9d9d9bdbdbd969696737373525252252525","fffffff0f0f0d9d9d9bdbdbd969696737373525252252525","fffffff0f0f0d9d9d9bdbdbd969696737373525252252525000000").map(af),Zy=uf(Wy),Qy=new Array(3).concat("efedf5bcbddc756bb1","f2f0f7cbc9e29e9ac86a51a3","f2f0f7cbc9e29e9ac8756bb154278f","f2f0f7dadaebbcbddc9e9ac8756bb154278f","f2f0f7dadaebbcbddc9e9ac8807dba6a51a34a1486","fcfbfdefedf5dadaebbcbddc9e9ac8807dba6a51a34a1486","fcfbfdefedf5dadaebbcbddc9e9ac8807dba6a51a354278f3f007d").map(af),Jy=uf(Qy),Ky=new 
Array(3).concat("fee0d2fc9272de2d26","fee5d9fcae91fb6a4acb181d","fee5d9fcae91fb6a4ade2d26a50f15","fee5d9fcbba1fc9272fb6a4ade2d26a50f15","fee5d9fcbba1fc9272fb6a4aef3b2ccb181d99000d","fff5f0fee0d2fcbba1fc9272fb6a4aef3b2ccb181d99000d","fff5f0fee0d2fcbba1fc9272fb6a4aef3b2ccb181da50f1567000d").map(af),t_=uf(Ky),n_=new Array(3).concat("fee6cefdae6be6550d","feeddefdbe85fd8d3cd94701","feeddefdbe85fd8d3ce6550da63603","feeddefdd0a2fdae6bfd8d3ce6550da63603","feeddefdd0a2fdae6bfd8d3cf16913d948018c2d04","fff5ebfee6cefdd0a2fdae6bfd8d3cf16913d948018c2d04","fff5ebfee6cefdd0a2fdae6bfd8d3cf16913d94801a636037f2704").map(af),e_=uf(n_),r_=bl(Wt(300,.5,0),Wt(-240,.5,1)),i_=bl(Wt(-100,.75,.35),Wt(80,1.5,.8)),o_=bl(Wt(260,.75,.35),Wt(80,1.5,.8)),a_=Wt(),u_=ff(af("44015444025645045745055946075a46085c460a5d460b5e470d60470e6147106347116447136548146748166848176948186a481a6c481b6d481c6e481d6f481f70482071482173482374482475482576482677482878482979472a7a472c7a472d7b472e7c472f7d46307e46327e46337f463480453581453781453882443983443a83443b84433d84433e85423f854240864241864142874144874045884046883f47883f48893e49893e4a893e4c8a3d4d8a3d4e8a3c4f8a3c508b3b518b3b528b3a538b3a548c39558c39568c38588c38598c375a8c375b8d365c8d365d8d355e8d355f8d34608d34618d33628d33638d32648e32658e31668e31678e31688e30698e306a8e2f6b8e2f6c8e2e6d8e2e6e8e2e6f8e2d708e2d718e2c718e2c728e2c738e2b748e2b758e2a768e2a778e2a788e29798e297a8e297b8e287c8e287d8e277e8e277f8e27808e26818e26828e26828e25838e25848e25858e24868e24878e23888e23898e238a8d228b8d228c8d228d8d218e8d218f8d21908d21918c20928c20928c20938c1f948c1f958b1f968b1f978b1f988b1f998a1f9a8a1e9b8a1e9c891e9d891f9e891f9f881fa0881fa1881fa1871fa28720a38620a48621a58521a68522a78522a88423a98324aa8325ab8225ac8226ad8127ad8128ae8029af7f2ab07f2cb17e2db27d2eb37c2fb47c31b57b32b67a34b67935b77937b87838b9773aba763bbb753dbc743fbc7340bd7242be7144bf7046c06f48c16e4ac16d4cc26c4ec36b50c46a52c56954c56856c66758c7655ac8645cc8635ec96260ca6063cb5f65cb5e67cc5c69cd5b6ccd5a6ece5870cf5773d05675d05477d1537ad1517cd2507fd34e81d34d84
d44b86d54989d5488bd6468ed64590d74393d74195d84098d83e9bd93c9dd93ba0da39a2da37a5db36a8db34aadc32addc30b0dd2fb2dd2db5de2bb8de29bade28bddf26c0df25c2df23c5e021c8e020cae11fcde11dd0e11cd2e21bd5e21ad8e219dae319dde318dfe318e2e418e5e419e7e419eae51aece51befe51cf1e51df4e61ef6e620f8e621fbe723fde725")),f_=ff(af("00000401000501010601010802010902020b02020d03030f03031204041405041606051806051a07061c08071e0907200a08220b09240c09260d0a290e0b2b100b2d110c2f120d31130d34140e36150e38160f3b180f3d19103f1a10421c10441d11471e114920114b21114e22115024125325125527125829115a2a115c2c115f2d11612f116331116533106734106936106b38106c390f6e3b0f703d0f713f0f72400f74420f75440f764510774710784910784a10794c117a4e117b4f127b51127c52137c54137d56147d57157e59157e5a167e5c167f5d177f5f187f601880621980641a80651a80671b80681c816a1c816b1d816d1d816e1e81701f81721f817320817521817621817822817922827b23827c23827e24828025828125818326818426818627818827818928818b29818c29818e2a81902a81912b81932b80942c80962c80982d80992d809b2e7f9c2e7f9e2f7fa02f7fa1307ea3307ea5317ea6317da8327daa337dab337cad347cae347bb0357bb2357bb3367ab5367ab73779b83779ba3878bc3978bd3977bf3a77c03a76c23b75c43c75c53c74c73d73c83e73ca3e72cc3f71cd4071cf4070d0416fd2426fd3436ed5446dd6456cd8456cd9466bdb476adc4869de4968df4a68e04c67e24d66e34e65e44f64e55064e75263e85362e95462ea5661eb5760ec5860ed5a5fee5b5eef5d5ef05f5ef1605df2625df2645cf3655cf4675cf4695cf56b5cf66c5cf66e5cf7705cf7725cf8745cf8765cf9785df9795df97b5dfa7d5efa7f5efa815ffb835ffb8560fb8761fc8961fc8a62fc8c63fc8e64fc9065fd9266fd9467fd9668fd9869fd9a6afd9b6bfe9d6cfe9f6dfea16efea36ffea571fea772fea973feaa74feac76feae77feb078feb27afeb47bfeb67cfeb77efeb97ffebb81febd82febf84fec185fec287fec488fec68afec88cfeca8dfecc8ffecd90fecf92fed194fed395fed597fed799fed89afdda9cfddc9efddea0fde0a1fde2a3fde3a5fde5a7fde7a9fde9aafdebacfcecaefceeb0fcf0b2fcf2b4fcf4b6fcf6b8fcf7b9fcf9bbfcfbbdfcfdbf")),c_=ff(af("00000401000501010601010802010a02020c02020e03021004031204031405041706041907051b08051d09061f0a07220b07240c08260d08290e092b10092d110a30120a32140b34150b37
160b39180c3c190c3e1b0c411c0c431e0c451f0c48210c4a230c4c240c4f260c51280b53290b552b0b572d0b592f0a5b310a5c320a5e340a5f3609613809623909633b09643d09653e0966400a67420a68440a68450a69470b6a490b6a4a0c6b4c0c6b4d0d6c4f0d6c510e6c520e6d540f6d550f6d57106e59106e5a116e5c126e5d126e5f136e61136e62146e64156e65156e67166e69166e6a176e6c186e6d186e6f196e71196e721a6e741a6e751b6e771c6d781c6d7a1d6d7c1d6d7d1e6d7f1e6c801f6c82206c84206b85216b87216b88226a8a226a8c23698d23698f24699025689225689326679526679727669827669a28659b29649d29649f2a63a02a63a22b62a32c61a52c60a62d60a82e5fa92e5eab2f5ead305dae305cb0315bb1325ab3325ab43359b63458b73557b93556ba3655bc3754bd3853bf3952c03a51c13a50c33b4fc43c4ec63d4dc73e4cc83f4bca404acb4149cc4248ce4347cf4446d04545d24644d34743d44842d54a41d74b3fd84c3ed94d3dda4e3cdb503bdd513ade5238df5337e05536e15635e25734e35933e45a31e55c30e65d2fe75e2ee8602de9612bea632aeb6429eb6628ec6726ed6925ee6a24ef6c23ef6e21f06f20f1711ff1731df2741cf3761bf37819f47918f57b17f57d15f67e14f68013f78212f78410f8850ff8870ef8890cf98b0bf98c0af98e09fa9008fa9207fa9407fb9606fb9706fb9906fb9b06fb9d07fc9f07fca108fca309fca50afca60cfca80dfcaa0ffcac11fcae12fcb014fcb216fcb418fbb61afbb81dfbba1ffbbc21fbbe23fac026fac228fac42afac62df9c72ff9c932f9cb35f8cd37f8cf3af7d13df7d340f6d543f6d746f5d949f5db4cf4dd4ff4df53f4e156f3e35af3e55df2e661f2e865f2ea69f1ec6df1ed71f1ef75f1f179f2f27df2f482f3f586f3f68af4f88ef5f992f6fa96f8fb9af9fc9dfafda1fcffa4")),s_=ff(af("0d088710078813078916078a19068c1b068d1d068e20068f2206902406912605912805922a05932c05942e05952f059631059733059735049837049938049a3a049a3c049b3e049c3f049c41049d43039e44039e46039f48039f4903a04b03a14c02a14e02a25002a25102a35302a35502a45601a45801a45901a55b01a55c01a65e01a66001a66100a76300a76400a76600a76700a86900a86a00a86c00a86e00a86f00a87100a87201a87401a87501a87701a87801a87a02a87b02a87d03a87e03a88004a88104a78305a78405a78606a68707a68808a68a09a58b0aa58d0ba58e0ca48f0da4910ea3920fa39410a29511a19613a19814a099159f9a169f9c179e9d189d9e199da01a9ca11b9ba21d9aa31e9aa51f99a62098a72197a82296aa2395ab2494ac2694ad2793
ae2892b02991b12a90b22b8fb32c8eb42e8db52f8cb6308bb7318ab83289ba3388bb3488bc3587bd3786be3885bf3984c03a83c13b82c23c81c33d80c43e7fc5407ec6417dc7427cc8437bc9447aca457acb4679cc4778cc4977cd4a76ce4b75cf4c74d04d73d14e72d24f71d35171d45270d5536fd5546ed6556dd7566cd8576bd9586ada5a6ada5b69db5c68dc5d67dd5e66de5f65de6164df6263e06363e16462e26561e26660e3685fe4695ee56a5de56b5de66c5ce76e5be76f5ae87059e97158e97257ea7457eb7556eb7655ec7754ed7953ed7a52ee7b51ef7c51ef7e50f07f4ff0804ef1814df1834cf2844bf3854bf3874af48849f48948f58b47f58c46f68d45f68f44f79044f79143f79342f89441f89540f9973ff9983ef99a3efa9b3dfa9c3cfa9e3bfb9f3afba139fba238fca338fca537fca636fca835fca934fdab33fdac33fdae32fdaf31fdb130fdb22ffdb42ffdb52efeb72dfeb82cfeba2cfebb2bfebd2afebe2afec029fdc229fdc328fdc527fdc627fdc827fdca26fdcb26fccd25fcce25fcd025fcd225fbd324fbd524fbd724fad824fada24f9dc24f9dd25f8df25f8e125f7e225f7e425f6e626f6e826f5e926f5eb27f4ed27f3ee27f3f027f2f227f1f426f1f525f0f724f0f921")),l_=Math.abs,h_=Math.atan2,d_=Math.cos,p_=Math.max,v_=Math.min,g_=Math.sin,y_=Math.sqrt,__=1e-12,b_=Math.PI,m_=b_/2,x_=2*b_;yf.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._point=0},lineEnd:function(){(this._line||0!==this._line&&1===this._point)&&this._context.closePath(),this._line=1-this._line},point:function(t,n){switch(t=+t,n=+n,this._point){case 0:this._point=1,this._line?this._context.lineTo(t,n):this._context.moveTo(t,n);break;case 1:this._point=2;default:this._context.lineTo(t,n)}}};var w_=Nf(_f);Tf.prototype={areaStart:function(){this._curve.areaStart()},areaEnd:function(){this._curve.areaEnd()},lineStart:function(){this._curve.lineStart()},lineEnd:function(){this._curve.lineEnd()},point:function(t,n){this._curve.point(n*Math.sin(t),n*-Math.cos(t))}};var M_=Array.prototype.slice,A_={draw:function(t,n){var e=Math.sqrt(n/b_);t.moveTo(e,0),t.arc(0,0,e,0,x_)}},T_={draw:function(t,n){var 
e=Math.sqrt(n/5)/2;t.moveTo(-3*e,-e),t.lineTo(-e,-e),t.lineTo(-e,-3*e),t.lineTo(e,-3*e),t.lineTo(e,-e),t.lineTo(3*e,-e),t.lineTo(3*e,e),t.lineTo(e,e),t.lineTo(e,3*e),t.lineTo(-e,3*e),t.lineTo(-e,e),t.lineTo(-3*e,e),t.closePath()}},N_=Math.sqrt(1/3),S_=2*N_,E_={draw:function(t,n){var e=Math.sqrt(n/S_),r=e*N_;t.moveTo(0,-e),t.lineTo(r,0),t.lineTo(0,e),t.lineTo(-r,0),t.closePath()}},k_=Math.sin(b_/10)/Math.sin(7*b_/10),C_=Math.sin(x_/10)*k_,P_=-Math.cos(x_/10)*k_,z_={draw:function(t,n){var e=Math.sqrt(.8908130915292852*n),r=C_*e,i=P_*e;t.moveTo(0,-e),t.lineTo(r,i);for(var o=1;o<5;++o){var a=x_*o/5,u=Math.cos(a),f=Math.sin(a);t.lineTo(f*e,-u*e),t.lineTo(u*r-f*i,f*r+u*i)}t.closePath()}},R_={draw:function(t,n){var e=Math.sqrt(n),r=-e/2;t.rect(r,r,e,e)}},L_=Math.sqrt(3),D_={draw:function(t,n){var e=-Math.sqrt(n/(3*L_));t.moveTo(0,2*e),t.lineTo(-L_*e,-e),t.lineTo(L_*e,-e),t.closePath()}},U_=Math.sqrt(3)/2,q_=1/Math.sqrt(12),O_=3*(q_/2+1),Y_={draw:function(t,n){var e=Math.sqrt(n/O_),r=e/2,i=e*q_,o=r,a=e*q_+e,u=-o,f=a;t.moveTo(r,i),t.lineTo(o,a),t.lineTo(u,f),t.lineTo(-.5*r-U_*i,U_*r+-.5*i),t.lineTo(-.5*o-U_*a,U_*o+-.5*a),t.lineTo(-.5*u-U_*f,U_*u+-.5*f),t.lineTo(-.5*r+U_*i,-.5*i-U_*r),t.lineTo(-.5*o+U_*a,-.5*a-U_*o),t.lineTo(-.5*u+U_*f,-.5*f-U_*u),t.closePath()}},B_=[A_,T_,E_,R_,z_,D_,Y_];Yf.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._x0=this._x1=this._y0=this._y1=NaN,this._point=0},lineEnd:function(){switch(this._point){case 3:Of(this,this._x1,this._y1);case 2:this._context.lineTo(this._x1,this._y1)}(this._line||0!==this._line&&1===this._point)&&this._context.closePath(),this._line=1-this._line},point:function(t,n){switch(t=+t,n=+n,this._point){case 0:this._point=1,this._line?this._context.lineTo(t,n):this._context.moveTo(t,n);break;case 1:this._point=2;break;case 
2:this._point=3,this._context.lineTo((5*this._x0+this._x1)/6,(5*this._y0+this._y1)/6);default:Of(this,t,n)}this._x0=this._x1,this._x1=t,this._y0=this._y1,this._y1=n}},Bf.prototype={areaStart:qf,areaEnd:qf,lineStart:function(){this._x0=this._x1=this._x2=this._x3=this._x4=this._y0=this._y1=this._y2=this._y3=this._y4=NaN,this._point=0},lineEnd:function(){switch(this._point){case 1:this._context.moveTo(this._x2,this._y2),this._context.closePath();break;case 2:this._context.moveTo((this._x2+2*this._x3)/3,(this._y2+2*this._y3)/3),this._context.lineTo((this._x3+2*this._x2)/3,(this._y3+2*this._y2)/3),this._context.closePath();break;case 3:this.point(this._x2,this._y2),this.point(this._x3,this._y3),this.point(this._x4,this._y4)}},point:function(t,n){switch(t=+t,n=+n,this._point){case 0:this._point=1,this._x2=t,this._y2=n;break;case 1:this._point=2,this._x3=t,this._y3=n;break;case 2:this._point=3,this._x4=t,this._y4=n,this._context.moveTo((this._x0+4*this._x1+t)/6,(this._y0+4*this._y1+n)/6);break;default:Of(this,t,n)}this._x0=this._x1,this._x1=t,this._y0=this._y1,this._y1=n}},Ff.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._x0=this._x1=this._y0=this._y1=NaN,this._point=0},lineEnd:function(){(this._line||0!==this._line&&3===this._point)&&this._context.closePath(),this._line=1-this._line},point:function(t,n){switch(t=+t,n=+n,this._point){case 0:this._point=1;break;case 1:this._point=2;break;case 2:this._point=3;var e=(this._x0+4*this._x1+t)/6,r=(this._y0+4*this._y1+n)/6;this._line?this._context.lineTo(e,r):this._context.moveTo(e,r);break;case 3:this._point=4;default:Of(this,t,n)}this._x0=this._x1,this._x1=t,this._y0=this._y1,this._y1=n}},If.prototype={lineStart:function(){this._x=[],this._y=[],this._basis.lineStart()},lineEnd:function(){var t=this._x,n=this._y,e=t.length-1;if(e>0)for(var 
r,i=t[0],o=n[0],a=t[e]-i,u=n[e]-o,f=-1;++f<=e;)r=f/e,this._basis.point(this._beta*t[f]+(1-this._beta)*(i+r*a),this._beta*n[f]+(1-this._beta)*(o+r*u));this._x=this._y=null,this._basis.lineEnd()},point:function(t,n){this._x.push(+t),this._y.push(+n)}};var F_=function t(n){function e(t){return 1===n?new Yf(t):new If(t,n)}return e.beta=function(n){return t(+n)},e}(.85);Hf.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._x0=this._x1=this._x2=this._y0=this._y1=this._y2=NaN,this._point=0},lineEnd:function(){switch(this._point){case 2:this._context.lineTo(this._x2,this._y2);break;case 3:jf(this,this._x1,this._y1)}(this._line||0!==this._line&&1===this._point)&&this._context.closePath(),this._line=1-this._line},point:function(t,n){switch(t=+t,n=+n,this._point){case 0:this._point=1,this._line?this._context.lineTo(t,n):this._context.moveTo(t,n);break;case 1:this._point=2,this._x1=t,this._y1=n;break;case 2:this._point=3;default:jf(this,t,n)}this._x0=this._x1,this._x1=this._x2,this._x2=t,this._y0=this._y1,this._y1=this._y2,this._y2=n}};var I_=function t(n){function e(t){return new Hf(t,n)}return e.tension=function(n){return t(+n)},e}(0);Xf.prototype={areaStart:qf,areaEnd:qf,lineStart:function(){this._x0=this._x1=this._x2=this._x3=this._x4=this._x5=this._y0=this._y1=this._y2=this._y3=this._y4=this._y5=NaN,this._point=0},lineEnd:function(){switch(this._point){case 1:this._context.moveTo(this._x3,this._y3),this._context.closePath();break;case 2:this._context.lineTo(this._x3,this._y3),this._context.closePath();break;case 3:this.point(this._x3,this._y3),this.point(this._x4,this._y4),this.point(this._x5,this._y5)}},point:function(t,n){switch(t=+t,n=+n,this._point){case 0:this._point=1,this._x3=t,this._y3=n;break;case 1:this._point=2,this._context.moveTo(this._x4=t,this._y4=n);break;case 
2:this._point=3,this._x5=t,this._y5=n;break;default:jf(this,t,n)}this._x0=this._x1,this._x1=this._x2,this._x2=t,this._y0=this._y1,this._y1=this._y2,this._y2=n}};var j_=function t(n){function e(t){return new Xf(t,n)}return e.tension=function(n){return t(+n)},e}(0);Gf.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._x0=this._x1=this._x2=this._y0=this._y1=this._y2=NaN,this._point=0},lineEnd:function(){(this._line||0!==this._line&&3===this._point)&&this._context.closePath(),this._line=1-this._line},point:function(t,n){switch(t=+t,n=+n,this._point){case 0:this._point=1;break;case 1:this._point=2;break;case 2:this._point=3,this._line?this._context.lineTo(this._x2,this._y2):this._context.moveTo(this._x2,this._y2);break;case 3:this._point=4;default:jf(this,t,n)}this._x0=this._x1,this._x1=this._x2,this._x2=t,this._y0=this._y1,this._y1=this._y2,this._y2=n}};var H_=function t(n){function e(t){return new Gf(t,n)}return e.tension=function(n){return t(+n)},e}(0);$f.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._x0=this._x1=this._x2=this._y0=this._y1=this._y2=NaN,this._l01_a=this._l12_a=this._l23_a=this._l01_2a=this._l12_2a=this._l23_2a=this._point=0},lineEnd:function(){switch(this._point){case 2:this._context.lineTo(this._x2,this._y2);break;case 3:this.point(this._x2,this._y2)}(this._line||0!==this._line&&1===this._point)&&this._context.closePath(),this._line=1-this._line},point:function(t,n){if(t=+t,n=+n,this._point){var e=this._x2-t,r=this._y2-n;this._l23_a=Math.sqrt(this._l23_2a=Math.pow(e*e+r*r,this._alpha))}switch(this._point){case 0:this._point=1,this._line?this._context.lineTo(t,n):this._context.moveTo(t,n);break;case 1:this._point=2;break;case 
2:this._point=3;default:Vf(this,t,n)}this._l01_a=this._l12_a,this._l12_a=this._l23_a,this._l01_2a=this._l12_2a,this._l12_2a=this._l23_2a,this._x0=this._x1,this._x1=this._x2,this._x2=t,this._y0=this._y1,this._y1=this._y2,this._y2=n}};var X_=function t(n){function e(t){return n?new $f(t,n):new Hf(t,0)}return e.alpha=function(n){return t(+n)},e}(.5);Wf.prototype={areaStart:qf,areaEnd:qf,lineStart:function(){this._x0=this._x1=this._x2=this._x3=this._x4=this._x5=this._y0=this._y1=this._y2=this._y3=this._y4=this._y5=NaN,this._l01_a=this._l12_a=this._l23_a=this._l01_2a=this._l12_2a=this._l23_2a=this._point=0},lineEnd:function(){switch(this._point){case 1:this._context.moveTo(this._x3,this._y3),this._context.closePath();break;case 2:this._context.lineTo(this._x3,this._y3),this._context.closePath();break;case 3:this.point(this._x3,this._y3),this.point(this._x4,this._y4),this.point(this._x5,this._y5)}},point:function(t,n){if(t=+t,n=+n,this._point){var e=this._x2-t,r=this._y2-n;this._l23_a=Math.sqrt(this._l23_2a=Math.pow(e*e+r*r,this._alpha))}switch(this._point){case 0:this._point=1,this._x3=t,this._y3=n;break;case 1:this._point=2,this._context.moveTo(this._x4=t,this._y4=n);break;case 2:this._point=3,this._x5=t,this._y5=n;break;default:Vf(this,t,n)}this._l01_a=this._l12_a,this._l12_a=this._l23_a,this._l01_2a=this._l12_2a,this._l12_2a=this._l23_2a,this._x0=this._x1,this._x1=this._x2,this._x2=t,this._y0=this._y1,this._y1=this._y2,this._y2=n}};var G_=function t(n){function e(t){return n?new Wf(t,n):new Xf(t,0)}return e.alpha=function(n){return t(+n)},e}(.5);Zf.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._x0=this._x1=this._x2=this._y0=this._y1=this._y2=NaN,this._l01_a=this._l12_a=this._l23_a=this._l01_2a=this._l12_2a=this._l23_2a=this._point=0},lineEnd:function(){(this._line||0!==this._line&&3===this._point)&&this._context.closePath(),this._line=1-this._line},point:function(t,n){if(t=+t,n=+n,this._point){var 
e=this._x2-t,r=this._y2-n;this._l23_a=Math.sqrt(this._l23_2a=Math.pow(e*e+r*r,this._alpha))}switch(this._point){case 0:this._point=1;break;case 1:this._point=2;break;case 2:this._point=3,this._line?this._context.lineTo(this._x2,this._y2):this._context.moveTo(this._x2,this._y2);break;case 3:this._point=4;default:Vf(this,t,n)}this._l01_a=this._l12_a,this._l12_a=this._l23_a,this._l01_2a=this._l12_2a,this._l12_2a=this._l23_2a,this._x0=this._x1,this._x1=this._x2,this._x2=t,this._y0=this._y1,this._y1=this._y2,this._y2=n}};var V_=function t(n){function e(t){return n?new Zf(t,n):new Gf(t,0)}return e.alpha=function(n){return t(+n)},e}(.5);Qf.prototype={areaStart:qf,areaEnd:qf,lineStart:function(){this._point=0},lineEnd:function(){this._point&&this._context.closePath()},point:function(t,n){t=+t,n=+n,this._point?this._context.lineTo(t,n):(this._point=1,this._context.moveTo(t,n))}},ec.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._x0=this._x1=this._y0=this._y1=this._t0=NaN,this._point=0},lineEnd:function(){switch(this._point){case 2:this._context.lineTo(this._x1,this._y1);break;case 3:nc(this,this._t0,tc(this,this._t0))}(this._line||0!==this._line&&1===this._point)&&this._context.closePath(),this._line=1-this._line},point:function(t,n){var e=NaN;if(t=+t,n=+n,t!==this._x1||n!==this._y1){switch(this._point){case 0:this._point=1,this._line?this._context.lineTo(t,n):this._context.moveTo(t,n);break;case 1:this._point=2;break;case 
2:this._point=3,nc(this,tc(this,e=Kf(this,t,n)),e);break;default:nc(this,this._t0,e=Kf(this,t,n))}this._x0=this._x1,this._x1=t,this._y0=this._y1,this._y1=n,this._t0=e}}},(rc.prototype=Object.create(ec.prototype)).point=function(t,n){ec.prototype.point.call(this,n,t)},ic.prototype={moveTo:function(t,n){this._context.moveTo(n,t)},closePath:function(){this._context.closePath()},lineTo:function(t,n){this._context.lineTo(n,t)},bezierCurveTo:function(t,n,e,r,i,o){this._context.bezierCurveTo(n,t,r,e,o,i)}},oc.prototype={areaStart:function(){this._line=0},areaEnd:function(){this._line=NaN},lineStart:function(){this._x=[],this._y=[]},lineEnd:function(){var t=this._x,n=this._y,e=t.length;if(e)if(this._line?this._context.lineTo(t[0],n[0]):this._context.moveTo(t[0],n[0]),2===e)this._context.lineTo(t[1],n[1]);else for(var r=ac(t),i=ac(n),o=0,a=1;a=0&&(this._t=1-this._t,this._line=1-this._line)},point:function(t,n){switch(t=+t,n=+n,this._point){case 0:this._point=1,this._line?this._context.lineTo(t,n):this._context.moveTo(t,n);break;case 1:this._point=2;default:if(this._t<=0)this._context.lineTo(this._x,n),this._context.lineTo(t,n);else{var e=this._x*(1-this._t)+t*this._t;this._context.lineTo(e,this._y),this._context.lineTo(e,n)}}this._x=t,this._y=n}},gc.prototype={constructor:gc,insert:function(t,n){var e,r,i;if(t){if(n.P=t,n.N=t.N,t.N&&(t.N.P=n),t.N=n,t.R){for(t=t.R;t.L;)t=t.L;t.L=n}else t.R=n;e=t}else this._?(t=mc(this._),n.P=null,n.N=t,t.P=t.L=n,e=t):(n.P=n.N=null,this._=n,e=null);for(n.L=n.R=null,n.U=e,n.C=!0,t=n;e&&e.C;)e===(r=e.U).L?(i=r.R)&&i.C?(e.C=i.C=!1,r.C=!0,t=r):(t===e.R&&(_c(this,e),e=(t=e).U),e.C=!1,r.C=!0,bc(this,r)):(i=r.L)&&i.C?(e.C=i.C=!1,r.C=!0,t=r):(t===e.L&&(bc(this,e),e=(t=e).U),e.C=!1,r.C=!0,_c(this,r)),e=t.U;this._.C=!1},remove:function(t){t.N&&(t.N.P=t.P),t.P&&(t.P.N=t.N),t.N=t.P=null;var 
n,e,r,i=t.U,o=t.L,a=t.R;if(e=o?a?mc(a):o:a,i?i.L===t?i.L=e:i.R=e:this._=e,o&&a?(r=e.C,e.C=t.C,e.L=o,o.U=e,e!==a?(i=e.U,e.U=t.U,t=e.R,i.L=t,e.R=a,a.U=e):(e.U=i,i=e,t=e.R)):(r=t.C,t=e),t&&(t.U=i),!r)if(t&&t.C)t.C=!1;else{do{if(t===this._)break;if(t===i.L){if((n=i.R).C&&(n.C=!1,i.C=!0,_c(this,i),n=i.R),n.L&&n.L.C||n.R&&n.R.C){n.R&&n.R.C||(n.L.C=!1,n.C=!0,bc(this,n),n=i.R),n.C=i.C,i.C=n.R.C=!1,_c(this,i),t=this._;break}}else if((n=i.L).C&&(n.C=!1,i.C=!0,bc(this,i),n=i.L),n.L&&n.L.C||n.R&&n.R.C){n.L&&n.L.C||(n.R.C=!1,n.C=!0,_c(this,n),n=i.L),n.C=i.C,i.C=n.L.C=!1,bc(this,i),t=this._;break}n.C=!0,t=i,i=i.U}while(!t.C);t&&(t.C=!1)}}};var $_,W_,Z_,Q_,J_,K_=[],tb=[],nb=1e-6,eb=1e-12;Oc.prototype={constructor:Oc,polygons:function(){var t=this.edges;return this.cells.map(function(n){var e=n.halfedges.map(function(e){return Sc(n,t[e])});return e.data=n.site.data,e})},triangles:function(){var t=[],n=this.edges;return this.cells.forEach(function(e,r){if(o=(i=e.halfedges).length)for(var i,o,a,u=e.site,f=-1,c=n[i[o-1]],s=c.left===u?c.right:c.left;++f=u)return null;var f=t-i.site[0],c=n-i.site[1],s=f*f+c*c;do{i=o.cells[r=a],a=null,i.halfedges.forEach(function(e){var r=o.edges[e],u=r.left;if(u!==i.site&&u||(u=r.right)){var f=t-u[0],c=n-u[1],l=f*f+c*c;lt?1:n>=t?0:NaN},t.deviation=a,t.extent=u,t.histogram=function(){function t(t){var i,o,a=t.length,u=new Array(a);for(i=0;il;)h.pop(),--p;var v,g=new Array(p+1);for(i=0;i<=p;++i)(v=g[i]=[]).x0=i>0?h[i-1]:c,v.x1=i=o.length)return null!=e&&n.sort(e),null!=r?r(n):n;for(var f,c,s,l=-1,h=n.length,d=o[i++],p=le(),v=a();++lo.length)return t;var i,u=a[e-1];return null!=r&&e>=o.length?i=t.entries():(i=[],t.each(function(t,r){i.push({key:r,values:n(t,e)})})),null!=u?i.sort(function(t,n){return u(t.key,n.key)}):i}var e,r,i,o=[],a=[];return i={object:function(n){return t(n,0,he,de)},map:function(n){return t(n,0,pe,ve)},entries:function(e){return n(t(e,0,pe,ve),0)},key:function(t){return o.push(t),i},sortKeys:function(t){return 
a[o.length-1]=t,i},sortValues:function(t){return e=t,i},rollup:function(t){return r=t,i}}},t.set=ye,t.map=le,t.keys=function(t){var n=[];for(var e in t)n.push(e);return n},t.values=function(t){var n=[];for(var e in t)n.push(t[e]);return n},t.entries=function(t){var n=[];for(var e in t)n.push({key:e,value:t[e]});return n},t.color=kt,t.rgb=Rt,t.hsl=Ut,t.lab=Bt,t.hcl=Vt,t.lch=function(t,n,e,r){return 1===arguments.length?Gt(t):new $t(e,n,t,null==r?1:r)},t.gray=function(t,n){return new Ft(t,0,0,null==n?1:n)},t.cubehelix=Wt,t.contours=we,t.contourDensity=function(){function t(t){var e=new Float32Array(v*y),r=new Float32Array(v*y);t.forEach(function(t,n,r){var i=a(t,n,r)+p>>h,o=u(t,n,r)+p>>h;i>=0&&i=0&&o>h),Ae({width:v,height:y,data:r},{width:v,height:y,data:e},l>>h),Me({width:v,height:y,data:e},{width:v,height:y,data:r},l>>h),Ae({width:v,height:y,data:r},{width:v,height:y,data:e},l>>h),Me({width:v,height:y,data:e},{width:v,height:y,data:r},l>>h),Ae({width:v,height:y,data:r},{width:v,height:y,data:e},l>>h);var i=_(e);if(!Array.isArray(i)){var o=g(e);i=d(0,o,i),(i=s(0,Math.floor(o/i)*i,i)).shift()}return we().thresholds(i).size([v,y])(e).map(n)}function n(t){return t.value*=Math.pow(2,-2*h),t.coordinates.forEach(e),t}function e(t){t.forEach(r)}function r(t){t.forEach(i)}function i(t){t[0]=t[0]*Math.pow(2,h)-p,t[1]=t[1]*Math.pow(2,h)-p}function o(){return p=3*l,v=f+2*p>>h,y=c+2*p>>h,t}var a=Te,u=Ne,f=960,c=500,l=20,h=2,p=3*l,v=f+2*p>>h,y=c+2*p>>h,_=be(20);return t.x=function(n){return arguments.length?(a="function"==typeof n?n:be(+n),t):a},t.y=function(n){return arguments.length?(u="function"==typeof n?n:be(+n),t):u},t.size=function(t){if(!arguments.length)return[f,c];var n=Math.ceil(t[0]),e=Math.ceil(t[1]);if(!(n>=0||n>=0))throw new Error("invalid size");return f=n,c=e,o()},t.cellSize=function(t){if(!arguments.length)return 1<=1))throw new Error("invalid cell size");return h=Math.floor(Math.log(t)/Math.LN2),o()},t.thresholds=function(n){return 
arguments.length?(_="function"==typeof n?n:Array.isArray(n)?be(Lh.call(n)):be(n),t):_},t.bandwidth=function(t){if(!arguments.length)return Math.sqrt(l*(l+1));if(!((t=+t)>=0))throw new Error("invalid bandwidth");return l=Math.round((Math.sqrt(4*t*t+1)-1)/2),o()},t},t.dispatch=N,t.drag=function(){function n(t){t.on("mousedown.drag",e).filter(g).on("touchstart.drag",o).on("touchmove.drag",a).on("touchend.drag touchcancel.drag",u).style("touch-action","none").style("-webkit-tap-highlight-color","rgba(0,0,0,0)")}function e(){if(!h&&d.apply(this,arguments)){var n=f("mouse",p.apply(this,arguments),pt,this,arguments);n&&(ct(t.event.view).on("mousemove.drag",r,!0).on("mouseup.drag",i,!0),_t(t.event.view),gt(),l=!1,c=t.event.clientX,s=t.event.clientY,n("start"))}}function r(){if(yt(),!l){var n=t.event.clientX-c,e=t.event.clientY-s;l=n*n+e*e>m}y.mouse("drag")}function i(){ct(t.event.view).on("mousemove.drag mouseup.drag",null),bt(t.event.view,l),yt(),y.mouse("end")}function o(){if(d.apply(this,arguments)){var n,e,r=t.event.changedTouches,i=p.apply(this,arguments),o=r.length;for(n=0;nf+d||ic+d||or.index){var p=f-u.x-u.vx,v=c-u.y-u.vy,g=p*p+v*v;gt.r&&(t.r=t[n].r)}function r(){if(i){var n,e,r=i.length;for(o=new Array(r),n=0;n=s)){(t.data!==o||t.next)&&(0===i&&(i=qe(),d+=i*i),0===f&&(f=qe(),d+=f*f),d1?(null==n?l.remove(t):l.set(t,i(n)),o):l.get(t)},find:function(n,e,r){var i,o,a,u,f,c=0,s=t.length;for(null==r?r=1/0:r*=r,c=0;c1?(d.on(t,n),o):d.on(t)}}},t.forceX=function(t){function n(t){for(var n,e=0,a=r.length;eqr(r[0],r[1])&&(r[1]=i[1]),qr(i[0],r[1])>qr(r[0],r[1])&&(r[0]=i[0])):o.push(r=i);for(a=-1/0,n=0,r=o[e=o.length-1];n<=e;r=i,++n)i=o[n],(u=qr(r[1],i[0]))>a&&(a=u,vd=i[0],yd=r[1])}return Md=Ad=null,vd===1/0||gd===1/0?[[NaN,NaN],[NaN,NaN]]:[[vd,gd],[yd,_d]]},t.geoCentroid=function(t){Td=Nd=Sd=Ed=kd=Cd=Pd=zd=Rd=Ld=Dd=0,_r(t,vp);var n=Rd,e=Ld,r=Dd,i=n*n+e*e+r*r;return 
i=.12&&i<.234&&r>=-.425&&r<-.214?c:i>=.166&&i<.234&&r>=-.214&&r<-.115?s:f).invert(t)},t.stream=function(t){return e&&r===t?e:e=function(t){var n=t.length;return{point:function(e,r){for(var i=-1;++i2?t[2]+90:90]):(t=e(),[t[0],t[1],t[2]-90])},e([0,0,90]).scale(159.155)},t.geoTransverseMercatorRaw=To,t.geoRotation=ri,t.geoStream=_r,t.geoTransform=function(t){return{stream:Zi(t)}},t.cluster=function(){function t(t){var o,a=0;t.eachAfter(function(t){var e=t.children;e?(t.x=function(t){return t.reduce(So,0)/t.length}(e),t.y=function(t){return 1+t.reduce(Eo,0)}(e)):(t.x=o?a+=n(t,o):0,t.y=0,o=t)});var u=function(t){for(var n;n=t.children;)t=n[0];return t}(t),f=function(t){for(var n;n=t.children;)t=n[n.length-1];return t}(t),c=u.x-n(u,f)/2,s=f.x+n(f,u)/2;return t.eachAfter(i?function(n){n.x=(n.x-t.x)*e,n.y=(t.y-n.y)*r}:function(n){n.x=(n.x-c)/(s-c)*e,n.y=(1-(t.y?n.y/t.y:1))*r})}var n=No,e=1,r=1,i=!1;return t.separation=function(e){return arguments.length?(n=e,t):n},t.size=function(n){return arguments.length?(i=!1,e=+n[0],r=+n[1],t):i?null:[e,r]},t.nodeSize=function(n){return arguments.length?(i=!0,e=+n[0],r=+n[1],t):i?[e,r]:null},t},t.hierarchy=Co,t.pack=function(){function t(t){return t.x=e/2,t.y=r/2,n?t.eachBefore(Zo(n)).eachAfter(Qo(i,.5)).eachBefore(Jo(1)):t.eachBefore(Zo(Wo)).eachAfter(Qo(Vo,1)).eachAfter(Qo(i,t.r/Math.min(e,r))).eachBefore(Jo(Math.min(e,r)/(2*t.r))),t}var n=null,e=1,r=1,i=Vo;return t.radius=function(e){return arguments.length?(n=function(t){return null==t?null:Go(t)}(e),t):n},t.size=function(n){return arguments.length?(e=+n[0],r=+n[1],t):[e,r]},t.padding=function(n){return arguments.length?(i="function"==typeof n?n:$o(+n),t):i},t},t.packSiblings=function(t){return Xo(t),t},t.packEnclose=Do,t.partition=function(){function t(t){var o=t.height+1;return t.x0=t.y0=r,t.x1=n,t.y1=e/o,t.eachBefore(function(t,n){return function(e){e.children&&ta(e,e.x0,t*(e.depth+1)/n,e.x1,t*(e.depth+2)/n);var i=e.x0,o=e.y0,a=e.x1-r,u=e.y1-r;a0)throw new Error("cycle");return 
o}var n=na,e=ea;return t.id=function(e){return arguments.length?(n=Go(e),t):n},t.parentId=function(n){return arguments.length?(e=Go(n),t):e},t},t.tree=function(){function t(t){var f=function(t){for(var n,e,r,i,o,a=new fa(t,0),u=[a];n=u.pop();)if(r=n._.children)for(n.children=new Array(o=r.length),i=o-1;i>=0;--i)u.push(e=n.children[i]=new fa(r[i],i)),e.parent=n;return(a.parent=new fa(null,0)).children=[a],a}(t);if(f.eachAfter(n),f.parent.m=-f.z,f.eachBefore(e),u)t.eachBefore(r);else{var c=t,s=t,l=t;t.eachBefore(function(t){t.xs.x&&(s=t),t.depth>l.depth&&(l=t)});var h=c===s?1:i(c,s)/2,d=h-c.x,p=o/(s.x+h+d),v=a/(l.depth||1);t.eachBefore(function(t){t.x=(t.x+d)*p,t.y=t.depth*v})}return t}function n(t){var n=t.children,e=t.parent.children,r=t.i?e[t.i-1]:null;if(n){(function(t){for(var n,e=0,r=0,i=t.children,o=i.length;--o>=0;)(n=i[o]).z+=e,n.m+=e,e+=n.s+(r+=n.c)})(t);var o=(n[0].z+n[n.length-1].z)/2;r?(t.z=r.z+i(t._,r._),t.m=t.z-o):t.z=o}else r&&(t.z=r.z+i(t._,r._));t.parent.A=function(t,n,e){if(n){for(var r,o=t,a=t,u=n,f=o.parent.children[0],c=o.m,s=a.m,l=u.m,h=f.m;u=oa(u),o=ia(o),u&&o;)f=ia(f),(a=oa(a)).a=t,(r=u.z+l-o.z-c+i(u._,o._))>0&&(aa(ua(u,t,e),t,r),c+=r,s+=r),l+=u.m,c+=o.m,h+=f.m,s+=a.m;u&&!oa(a)&&(a.t=u,a.m+=l-s),o&&!ia(f)&&(f.t=o,f.m+=c-h,e=t)}return e}(t,r,t.parent.A||e[0])}function e(t){t._.x=t.z+t.parent.m,t.m+=t.parent.m}function r(t){t.x*=o,t.y=t.depth*a}var i=ra,o=1,a=1,u=null;return t.separation=function(n){return arguments.length?(i=n,t):i},t.size=function(n){return arguments.length?(u=!1,o=+n[0],a=+n[1],t):u?null:[o,a]},t.nodeSize=function(n){return arguments.length?(u=!0,o=+n[0],a=+n[1],t):u?[o,a]:null},t},t.treemap=function(){function t(t){return t.x0=t.y0=0,t.x1=i,t.y1=o,t.eachBefore(n),a=[0],r&&t.eachBefore(Ko),t}function n(t){var n=a[t.depth],r=t.x0+n,i=t.y0+n,o=t.x1-n,h=t.y1-n;o=n-1){var c=f[t];return c.x0=r,c.y0=i,c.x1=a,void(c.y1=u)}for(var l=s[t],h=e/2+l,d=t+1,p=n-1;d>>1;s[v]u-i){var _=(r*y+a*g)/e;o(t,d,g,r,i,_,u),o(d,n,y,_,i,a,u)}else{var 
b=(i*y+u*g)/e;o(t,d,g,r,i,a,b),o(d,n,y,r,b,a,u)}}var a,u,f=t.children,c=f.length,s=new Array(c+1);for(s[0]=u=a=0;a=0;--n)c.push(t[r[o[n]][2]]);for(n=+u;nu!=c>u&&a<(f-e)*(u-r)/(c-r)+e&&(s=!s),f=e,c=r;return s},t.polygonLength=function(t){for(var n,e,r=-1,i=t.length,o=t[i-1],a=o[0],u=o[1],f=0;++r1)&&(t-=Math.floor(t));var n=Math.abs(t-.5);return a_.h=360*t-100,a_.s=1.5-1.5*n,a_.l=.8-.9*n,a_+""},t.interpolateWarm=i_,t.interpolateCool=o_,t.interpolateViridis=u_,t.interpolateMagma=f_,t.interpolateInferno=c_,t.interpolatePlasma=s_,t.create=function(t){return ct(C(t).call(document.documentElement))},t.creator=C,t.local=st,t.matcher=ys,t.mouse=pt,t.namespace=k,t.namespaces=ds,t.clientPoint=dt,t.select=ct,t.selectAll=function(t){return"string"==typeof t?new ut([document.querySelectorAll(t)],[document.documentElement]):new ut([null==t?[]:t],ms)},t.selection=ft,t.selector=z,t.selectorAll=L,t.style=F,t.touch=vt,t.touches=function(t,n){null==n&&(n=ht().touches);for(var e=0,r=n?n.length:0,i=new Array(r);eh;if(f||(f=t=ie()),l__)if(p>x_-__)f.moveTo(l*d_(h),l*g_(h)),f.arc(0,0,l,h,d,!v),s>__&&(f.moveTo(s*d_(d),s*g_(d)),f.arc(0,0,s,d,h,v));else{var g,y,_=h,b=d,m=h,x=d,w=p,M=p,A=u.apply(this,arguments)/2,T=A>__&&(i?+i.apply(this,arguments):y_(s*s+l*l)),N=v_(l_(l-s)/2,+r.apply(this,arguments)),S=N,E=N;if(T>__){var k=sf(T/s*g_(A)),C=sf(T/l*g_(A));(w-=2*k)>__?(k*=v?1:-1,m+=k,x-=k):(w=0,m=x=(h+d)/2),(M-=2*C)>__?(C*=v?1:-1,_+=C,b-=C):(M=0,_=b=(h+d)/2)}var P=l*d_(_),z=l*g_(_),R=s*d_(x),L=s*g_(x);if(N>__){var D=l*d_(b),U=l*g_(b),q=s*d_(m),O=s*g_(m);if(p__?function(t,n,e,r,i,o,a,u){var f=e-t,c=r-n,s=a-i,l=u-o,h=(s*(n-o)-l*(t-i))/(l*f-s*c);return[t+h*f,n+h*c]}(P,z,q,O,D,U,R,L):[R,L],B=P-Y[0],F=z-Y[1],I=D-Y[0],j=U-Y[1],H=1/g_(function(t){return 
t>1?0:t<-1?b_:Math.acos(t)}((B*I+F*j)/(y_(B*B+F*F)*y_(I*I+j*j)))/2),X=y_(Y[0]*Y[0]+Y[1]*Y[1]);S=v_(N,(s-X)/(H-1)),E=v_(N,(l-X)/(H+1))}}M>__?E>__?(g=gf(q,O,P,z,l,E,v),y=gf(D,U,R,L,l,E,v),f.moveTo(g.cx+g.x01,g.cy+g.y01),E__&&w>__?S>__?(g=gf(R,L,D,U,s,-S,v),y=gf(P,z,q,O,s,-S,v),f.lineTo(g.cx+g.x01,g.cy+g.y01),S0&&(d+=l);for(null!=e?p.sort(function(t,n){return e(v[t],v[n])}):null!=r&&p.sort(function(n,e){return r(t[n],t[e])}),u=0,c=d?(y-h*b)/d:0;u0?l*c:0)+b,v[f]={data:t[f],index:u,value:l,startAngle:g,endAngle:s,padAngle:_};return v}var n=Af,e=Mf,r=null,i=cf(0),o=cf(x_),a=cf(0);return t.value=function(e){return arguments.length?(n="function"==typeof e?e:cf(+e),t):n},t.sortValues=function(n){return arguments.length?(e=n,r=null,t):e},t.sort=function(n){return arguments.length?(r=n,e=null,t):r},t.startAngle=function(n){return arguments.length?(i="function"==typeof n?n:cf(+n),t):i},t.endAngle=function(n){return arguments.length?(o="function"==typeof n?n:cf(+n),t):o},t.padAngle=function(n){return arguments.length?(a="function"==typeof n?n:cf(+n),t):a},t},t.areaRadial=kf,t.radialArea=kf,t.lineRadial=Ef,t.radialLine=Ef,t.pointRadial=Cf,t.linkHorizontal=function(){return Rf(Lf)},t.linkVertical=function(){return Rf(Df)},t.linkRadial=function(){var t=Rf(Uf);return t.angle=t.x,delete t.x,t.radius=t.y,delete t.y,t},t.symbol=function(){function t(){var t;if(r||(r=t=ie()),n.apply(this,arguments).draw(r,+e.apply(this,arguments)),t)return r=null,t+""||null}var n=cf(A_),e=cf(64),r=null;return t.type=function(e){return arguments.length?(n="function"==typeof e?e:cf(e),t):n},t.size=function(n){return arguments.length?(e="function"==typeof n?n:cf(+n),t):e},t.context=function(n){return arguments.length?(r=null==n?null:n,t):r},t},t.symbols=B_,t.symbolCircle=A_,t.symbolCross=T_,t.symbolDiamond=E_,t.symbolSquare=R_,t.symbolStar=z_,t.symbolTriangle=D_,t.symbolWye=Y_,t.curveBasisClosed=function(t){return new Bf(t)},t.curveBasisOpen=function(t){return new Ff(t)},t.curveBasis=function(t){return 
new Yf(t)},t.curveBundle=F_,t.curveCardinalClosed=j_,t.curveCardinalOpen=H_,t.curveCardinal=I_,t.curveCatmullRomClosed=G_,t.curveCatmullRomOpen=V_,t.curveCatmullRom=X_,t.curveLinearClosed=function(t){return new Qf(t)},t.curveLinear=_f,t.curveMonotoneX=function(t){return new ec(t)},t.curveMonotoneY=function(t){return new rc(t)},t.curveNatural=function(t){return new oc(t)},t.curveStep=function(t){return new uc(t,.5)},t.curveStepAfter=function(t){return new uc(t,1)},t.curveStepBefore=function(t){return new uc(t,0)},t.stack=function(){function t(t){var o,a,u=n.apply(this,arguments),f=t.length,c=u.length,s=new Array(c);for(o=0;o0){for(var e,r,i,o=0,a=t[0].length;o1)for(var e,r,i,o,a,u,f=0,c=t[n[0]].length;f=0?(r[0]=o,r[1]=o+=i):i<0?(r[1]=a,r[0]=a+=i):r[0]=o},t.stackOffsetNone=fc,t.stackOffsetSilhouette=function(t,n){if((e=t.length)>0){for(var e,r=0,i=t[n[0]],o=i.length;r0&&(r=(e=t[n[0]]).length)>0){for(var e,r,i,o=0,a=1;azl&&e.name===n)return new qn([[t]],sh,n,+r)}return null},t.interrupt=Ln,t.voronoi=function(){function t(t){return new Oc(t.map(function(r,i){var o=[Math.round(n(r,i,t)/nb)*nb,Math.round(e(r,i,t)/nb)*nb];return o.index=i,o.data=r,o}),r)}var n=pc,e=vc,r=null;return t.polygons=function(n){return t(n).polygons()},t.links=function(n){return t(n).links()},t.triangles=function(n){return t(n).triangles()},t.x=function(e){return arguments.length?(n="function"==typeof e?e:dc(+e),t):n},t.y=function(n){return arguments.length?(e="function"==typeof n?n:dc(+n),t):e},t.extent=function(n){return arguments.length?(r=null==n?null:[[+n[0][0],+n[0][1]],[+n[1][0],+n[1][1]]],t):r&&[[r[0][0],r[0][1]],[r[1][0],r[1][1]]]},t.size=function(n){return arguments.length?(r=null==n?null:[[0,0],[+n[0],+n[1]]],t):r&&[r[1][0]-r[0][0],r[1][1]-r[0][1]]},t},t.zoom=function(){function n(t){t.property("__zoom",Gc).on("wheel.zoom",f).on("mousedown.zoom",c).on("dblclick.zoom",s).filter(m).on("touchstart.zoom",l).on("touchmove.zoom",h).on("touchend.zoom 
touchcancel.zoom",d).style("touch-action","none").style("-webkit-tap-highlight-color","rgba(0,0,0,0)")}function e(t,n){return(n=Math.max(x[0],Math.min(x[1],n)))===t.k?t:new Bc(n,t.x,t.y)}function r(t,n,e){var r=n[0]-e[0]*t.k,i=n[1]-e[1]*t.k;return r===t.x&&i===t.y?t:new Bc(t.k,r,i)}function i(t){return[(+t[0][0]+ +t[1][0])/2,(+t[0][1]+ +t[1][1])/2]}function o(t,n,e){t.on("start.zoom",function(){a(this,arguments).start()}).on("interrupt.zoom end.zoom",function(){a(this,arguments).end()}).tween("zoom",function(){var t=arguments,r=a(this,t),o=y.apply(this,t),u=e||i(o),f=Math.max(o[1][0]-o[0][0],o[1][1]-o[0][1]),c=this.__zoom,s="function"==typeof n?n.apply(this,t):n,l=A(c.invert(u).concat(f/c.k),s.invert(u).concat(f/s.k));return function(t){if(1===t)t=s;else{var n=l(t),e=f/n[2];t=new Bc(e,u[0]-n[0]*e,u[1]-n[1]*e)}r.zoom(null,t)}})}function a(t,n){for(var e,r=0,i=T.length;rC}n.zoom("mouse",_(r(n.that.__zoom,n.mouse[0]=pt(n.that),n.mouse[1]),n.extent,w))},!0).on("mouseup.zoom",function(){e.on("mousemove.zoom mouseup.zoom",null),bt(t.event.view,n.moved),jc(),n.end()},!0),i=pt(this),o=t.event.clientX,u=t.event.clientY;_t(t.event.view),Ic(),n.mouse=[i,this.__zoom.invert(i)],Ln(this),n.start()}}function s(){if(g.apply(this,arguments)){var i=this.__zoom,a=pt(this),u=i.invert(a),f=i.k*(t.event.shiftKey?.5:2),c=_(r(e(i,f),a,u),y.apply(this,arguments),w);jc(),M>0?ct(this).transition().duration(M).call(o,c,a):ct(this).call(n.transform,c)}}function l(){if(g.apply(this,arguments)){var n,e,r,i,o=a(this,arguments),u=t.event.changedTouches,f=u.length;for(Ic(),e=0;e parseTime(d.date)).left; + let i = bisect(data, detailDate, 1); + + let workload = data[i]; + let date = workload.date; + let name = workload.name; + let opsSec = workload.opsSec; + let filename = workload.summaryPath; + + fetchWriteThroughputSummaryData(filename) + .then( + d => renderWriteThroughputSummaryDetail(name, date, opsSec, d), + _ => renderWriteThroughputSummaryDetail(name, date, opsSec, null), + ); +} + +/* + * 
Renders the write-throughput summary view, given the corresponding data.
+ *
+ * This function generates a time-series similar to the YCSB benchmark data.
+ * The x-axis represents the day on which the benchmark was run, and the y-axis
+ * represents the calculated "max sustainable throughput" in ops-second.
+ + const defs = svg.append("defs"); + + defs.append("clipPath") + .attr("id", dataKey) + .append("rect") + .attr("x", 0) + .attr("y", -margin.top) + .attr("width", width) + .attr("height", margin.top + height + 10); + + // Plot time-series. + + const view = g.append("g") + .attr("class", "view") + .attr("clip-path", "url(#" + dataKey + ")"); + + const line = d3.line() + .x(d => x(parseTime(d.date))) + .y(d => y(d.opsSec)); + + const path = view.selectAll(".line1") + .data([data]) + .enter() + .append("path") + .attr("class", "line1") + .attr("d", line) + .style("stroke", z(0)); + + // Hover to show labels. + + const lineHover = g + .append("line") + .attr("class", "hover") + .style("fill", "none") + .style("stroke", "#f99") + .style("stroke-width", "1px"); + + const dateHover = g + .append("text") + .attr("class", "hover") + .attr("fill", "#f22") + .attr("text-anchor", "middle") + .attr("alignment-baseline", "hanging") + .attr("transform", "translate(0, 0)"); + + const opsHover = g + .append("text") + .attr("class", "hover") + .attr("fill", "#f22") + .attr("text-anchor", "middle") + .attr("transform", "translate(0, 0)"); + + const marker = g + .append("circle") + .attr("class", "hover") + .attr("r", 3) + .style("opacity", "0") + .style("stroke", "#f22") + .style("fill", "#f22"); + + svg.node().updateMouse = function (mouse, date, hover) { + const mousex = mouse[0]; + const bisect = d3.bisector(d => parseTime(d.date)).left; + const i = bisect(data, date, 1); + const v = + i === data.length + ? data[i - 1] + : mousex - x(parseTime(data[i - 1].date)) < x(parseTime(data[i].date)) - mousex + ? 
data[i - 1] + : data[i]; + const noData = mousex < x(parseTime(data[0].date)); + + let lineY = height; + if (!noData) { + lineY = pathGetY(path.node(), mousex); + } + + let val, valY, valFormat; + val = v.opsSec; + valY = y(val); + valFormat = d3.format(",.0f"); + + lineHover + .attr("x1", mousex) + .attr("x2", mousex) + .attr("y1", lineY) + .attr("y2", height); + marker.attr("transform", "translate(" + x(parseTime(v.date)) + "," + valY + ")"); + dateHover + .attr("transform", "translate(" + mousex + "," + (height + 8) + ")") + .text(formatTime(date)); + opsHover + .attr("transform", "translate(" + x(parseTime(v.date)) + "," + (valY - 7) + ")") + .text(valFormat(val)); + }; + + // Panning and zooming. + + const updateZoom = function (t) { + x.domain(t.rescaleX(x2).domain()); + g.select(".axis--x").call(xAxis); + g.selectAll(".line1").attr("d", line); + }; + svg.node().updateZoom = updateZoom; + + const zoom = d3.zoom() + .extent([[0, 0], [width, 1]]) + .scaleExtent([0.25, 2]) // [45, 360] days + .translateExtent([[-width * 3, 0], [width, 1]]) // [today-360, today] + .on("zoom", function () { + const t = d3.event.transform; + if (!d3.event.sourceEvent) { + updateZoom(t); + return; + } + + d3.selectAll(".chart").each(function () { + if (this.updateZoom != null) { + this.updateZoom(t); + } + }); + + d3.selectAll(".chart").each(function () { + this.__zoom = t.translate(0, 0); + }); + }); + + svg.call(zoom); + svg.call(zoom.transform, d3.zoomTransform(svg.node())); + + svg.append("rect") + .attr("class", "mouse") + .attr("cursor", "move") + .attr("fill", "none") + .attr("pointer-events", "all") + .attr("width", width) + .attr("height", height + margin.top + margin.bottom) + .attr("transform", "translate(" + margin.left + "," + 0 + ")") + .on("mousemove", function () { + const mouse = d3.mouse(this); + const date = x.invert(mouse[0]); + + d3.selectAll(".chart").each(function () { + if (this.updateMouse != null) { + this.updateMouse(mouse, date, 1); + } + }); + }) + 
.on("mouseover", function () { + d3.selectAll(".chart") + .selectAll(".hover") + .style("opacity", 1.0); + }) + .on("mouseout", function () { + d3.selectAll(".chart") + .selectAll(".hover") + .style("opacity", 0); + }) + .on("click", function(d) { + // Use the date corresponding to the clicked data point to bisect + // into the workload data to pluck out the correct datapoint. + const mouse = d3.mouse(this); + let detailDate = d3.timeDay.floor(x.invert(mouse[0])); + bisectAndRenderWriteThroughputDetail(data, detailDate); + }); +} + +function fetchWriteThroughputSummaryData(file) { + return fetch(writeThroughputDetailURL(file)) + .then(response => response.json()) + .then(data => { + for (let key in data) { + let csvData = data[key].rawData; + data[key].data = d3.csvParseRows(csvData, function (d, i) { + return { + elapsed: +d[0], + opsSec: +d[1], + passed: d[2] === 'true', + size: +d[3], + levels: +d[4], + }; + }); + delete data[key].rawData; + } + return data; + }); +} + +/* + * Renders the write-throughput detail view, given the correspnding data, and + * the particular workload and date on which it was run. + * + * This function generates a series with the x-axis representing the elapsed + * time since the start of the benchmark, and the measured write load at that + * point in time (in ops/second). Each series is a worker that participated in + * the benchmark on the selected date. + */ +function renderWriteThroughputSummaryDetail(workload, date, opsSec, rawData) { + const svg = d3.select(".chart.write-throughput-detail"); + + // Remove anything that was previously on the canvas. This ensures that a + // user clicking multiple times does not keep adding data to the canvas. 
+ svg.selectAll("*").remove(); + + const margin = {top: 25, right: 60, bottom: 25, left: 60}; + let maxX = 0; + let maxY = 0; + for (let key in rawData) { + let run = rawData[key]; + maxX = Math.max(maxX, d3.max(run.data, d => d.elapsed)); + maxY = Math.max(maxY, d3.max(run.data, d => d.opsSec)); + } + + const width = styleWidth(svg) - margin.left - margin.right; + const height = styleHeight(svg) - margin.top - margin.bottom; + + // Panning and zooming. + // These callbacks are defined as they are called from the panning / + // zooming functions elsewhere, however, they are simply no-ops on this + // chart, as they x-axis is a measure of "elapsed time" rather than a date. + + svg.node().updateMouse = function (mouse, date, hover) {} + svg.node().updateZoom = function () {}; + + // Set up axes. + + const x = d3.scaleLinear() + .domain([0, 8.5 * 3600]) + .range([0, width]); + + const y = d3.scaleLinear() + .domain([0, maxY * 1.1]) + .range([height, 0]); + + const z = d3.scaleOrdinal(d3.schemeCategory10); + + const xAxis = d3.axisBottom(x) + .ticks(5) + .tickFormat(d => Math.floor(d / 3600) + "h"); + + const yAxis = d3.axisLeft(y) + .ticks(5); + + const g = svg + .append("g") + .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); + + g.append("g") + .attr("class", "axis axis--x") + .attr("transform", "translate(0," + height + ")") + .call(xAxis); + + g.append("g") + .attr("class", "axis axis--y") + .call(yAxis); + + // If we get no data, we just render an empty chart. + if (rawData == null) { + g.append("text") + .attr("class", "chart-title") + .attr("x", margin.left + width / 2) + .attr("y", height / 2) + .style("text-anchor", "middle") + .style("font", "8pt sans-serif") + .text("Data unavailable"); + return; + } + + g.append("text") + .attr("class", "chart-title") + .attr("x", margin.left + width / 2) + .attr("y", 0) + .style("text-anchor", "middle") + .style("font", "8pt sans-serif") + .text("Ops/sec over time"); + + // Plot data. 
+ + const view = g.append("g") + .attr("class", "view"); + + let values = []; + for (let key in rawData) { + values.push({ + id: key, + values: rawData[key].data, + }); + } + + const line = d3.line() + .x(d => x(d.elapsed)) + .y(d => y(d.opsSec)); + + const path = view.selectAll(".line1") + .data(values) + .enter() + .append("path") + .attr("class", "line1") + .attr("d", d => line(d.values)) + .style("stroke", d => z(d.id)); + + // Draw a horizontal line for the calculated ops/sec average. + + view.append("path") + .attr("d", d3.line()([[x(0), y(opsSec)], [x(maxX), y(opsSec)]])) + .attr("stroke", "black") + .attr("stroke-width", "2") + .style("stroke-dasharray", ("2, 5")); +} diff --git a/pebble/docs/memory.md b/pebble/docs/memory.md new file mode 100644 index 0000000..b9f4a63 --- /dev/null +++ b/pebble/docs/memory.md @@ -0,0 +1,92 @@ +# Memory Management + +## Background + +Pebble has two significant sources of memory usage: MemTables and the +Block Cache. MemTables buffer data that has been written to the WAL +but not yet flushed to an SSTable. The Block Cache provides a cache of +uncompressed SSTable data blocks. + +Originally, Pebble used regular Go memory allocation for the memory +backing both MemTables and the Block Cache. This was problematic as it +put significant pressure on the Go GC. The higher the bandwidth of +memory allocations, the more work GC has to do to reclaim the +memory. In order to lessen the pressure on the Go GC, an "allocation +cache" was introduced to the Block Cache which allowed reusing the +memory backing cached blocks in most circumstances. This produced a +dramatic reduction in GC pressure and a measurable performance +improvement in CockroachDB workloads. + +Unfortunately, the use of Go allocated memory still caused a +problem. CockroachDB running on top of Pebble often resulted in an RSS +(resident set size) 2x what it was when using RocksDB. 
The cause of +this effect is due to the Go runtime's heuristic for triggering GC: + +> A collection is triggered when the ratio of freshly allocated data +> to live data remaining after the previous collection reaches this +> percentage. + +This percentage can be configured by the +[`GOGC`](https://golang.org/pkg/runtime/) environment variable or by +calling +[`debug.SetGCPercent`](https://golang.org/pkg/runtime/debug/#SetGCPercent). The +default value is `100`, which means that GC is triggered when the +freshly allocated data is equal to the amount of live data at the end +of the last collection period. This generally works well in practice, +but the Pebble Block Cache is often configured to be 10s of gigabytes +in size. Waiting for 10s of gigabytes of data to be allocated before +triggering a GC results in very large Go heap sizes. + +## Manual Memory Management + +Attempting to adjust `GOGC` to account for the significant amount of +memory used by the Block Cache is fraught. What value should be used? +`10%`? `20%`? Should the setting be tuned dynamically? Rather than +introducing a heuristic which may have cascading effects on the +application using Pebble, we decided to move the Block Cache and +MemTable memory out of the Go heap. This is done by using the C memory +allocator, though it could also be done by providing a simple memory +allocator in Go which uses `mmap` to allocate memory. + +In order to support manual memory management for the Block Cache and +MemTables, Pebble needs to precisely track their lifetime. This was +already being done for the MemTable in order to account for its memory +usage in metrics. It was mostly being done for the Block Cache. Values +stores in the Block Cache are reference counted and are returned to +the "alloc cache" when their reference count falls +to 0. Unfortunately, this tracking wasn't precise and there were +numerous cases where the cache values were being leaked. 
This was +acceptable in a world where the Go GC would clean up after us. It is +unacceptable if the leak becomes permanent. + +## Leak Detection + +In order to find all of the cache value leaks, Pebble has a leak +detection facility built on top of +[`runtime.SetFinalizer`](https://golang.org/pkg/runtime/#SetFinalizer). A +finalizer is a function associated with an object which is run when +the object is no longer reachable. On the surface, this sounds perfect +as a facility for performing all memory reclamation. Unfortunately, +finalizers are generally frowned upon by the Go implementors, and come +with very loose guarantees: + +> The finalizer is scheduled to run at some arbitrary time after the +> program can no longer reach the object to which obj points. There is +> no guarantee that finalizers will run before a program exits, so +> typically they are useful only for releasing non-memory resources +> associated with an object during a long-running program + +This language is somewhat frightening, but in practice finalizers are run at the +end of every GC period. Pebble primarily relies on finalizers for its leak +detection facility. In the block cache, a finalizer is associated with the Go +allocated `cache.Value` object. When the finalizer is run, it checks that the +buffer backing the `cache.Value` has been freed. This leak detection facility is +enabled by the `"invariants"` build tag which is enabled by the Pebble unit +tests. + +There also exists a very specific memory reclamation use case in the block cache +that ensures that structs with transitively reachable fields backed by manually +allocated memory that are pooled in a `sync.Pool` are freed correctly when their +parent struct is released from the pool and consequently garbage collected by +the Go runtime (see `cache/entry_normal.go`). The loose guarantees provided by +the runtime are reasonable to rely on in this case to prevent a memory leak. 
diff --git a/pebble/docs/range_deletions.md b/pebble/docs/range_deletions.md new file mode 100644 index 0000000..19b8f06 --- /dev/null +++ b/pebble/docs/range_deletions.md @@ -0,0 +1,471 @@ +# Range Deletions + +TODO: The following explanation of range deletions does not take into account +the recent change to prohibit splitting of a user key between sstables. This +change simplifies the logic, removing 'improperly truncated range tombstones.' + +TODO: The following explanation of range deletions ignores the +kind/trailer that appears at the end of keys after the sequence +number. This should be harmless but need to add a justification on why +it is harmless. + +## Background and Notation + +Range deletions are represented as `[start, end)#seqnum`. Points +(set/merge/...) are represented as `key#seqnum`. A range delete `[s, e)#n1` +deletes every point `k#n2` where `k \in [s, e)` and `n2 < n1`. +The inequality `n2 < n1` is to handle the case where a range delete and +a point have the same sequence number -- this happens during sstable +ingestion where the whole sstable is assigned a single sequence number +that applies to all the data in it. + +There is additionally an infinity sequence number, represented as +`inf`, which is not used for any point, that we can use for reasoning +about range deletes. + +It has been asked why range deletes use an exclusive end key instead +of an inclusive end key. For string keys, one can convert a desired +range delete on `[s, e]` into a range delete on `[s, ImmediateSuccessor(e))`. +For strings, the immediate successor of a key +is that key with a \0 appended to it. However one cannot go in the +other direction: if one could represent only inclusive end keys in a +range delete and one desires to delete a range with an exclusive end +key `[s, e)#n`, one needs to compute `ImmediatePredecessor(e)` which +is an infinite length string. For example, +`ImmediatePredecessor("ab")` is `"aa\xff\xff...."`. 
Additionally, +regardless of user needs, the exclusive end key helps with splitting a +range delete as we will see later. + +We will sometimes use ImmediatePredecessor and ImmediateSuccessor in +the following for illustrating an idea, but we do not rely on them as +something that is viable to produce for a particular kind of key. And +even if viable, these functions are not currently provided to +RockDB/Pebble. + +### Visualization + +If we consider a 2 dimensional space with increasing keys on the X +axis (with every possible user key represented) and increasing +sequence numbers on the Y axis, range deletes apply to a rectangle +whose bottom edge sits on the X axis. + +The actual space represented by the ordering in our sstables is a one +dimensional space where `k1#n1` is less than `k2#n2` if either of the +following holds: + +- k1 < k2 + +- k1 = k2 and n1 > n2 (under the assumption that no two points with +the same key have the same sequence number). + +``` + ^ + | . > . > . > yy + | . > . > . > . + | . > . > . > . +n | V > xx > . > V + | . > x. > x. > . + | . > x. > x. > . + | . > x. > x. > . + | .> x.> x.> . + ------------------------------------------> + k IS(k) IS(IS(k)) +``` + +The above figure uses `.` to represent points and the X axis is dense in +that it represents all possible keys. `xx` represents the start of a +range delete and `x.` are the points which it deletes. The arrows `V` and +`>` represent the ordering of the points in the one dimensional space. +`IS` is shorthand for `ImmediateSuccessor` and the range delete represented +there is `[k, IS(IS(k)))#n`. Ignore `yy` for now. + +The one dimensional space works fine in a world with only points. But +issues arise when storing range deletes, that represent an action in 2 +dimensional space, into this one dimensional space. + +## Range Delete Boundaries and the Simplest World + +RocksDB/Pebble store the inclusive bounds of each sstable in one dimensional +space. 
The range deletes' two dimensional behavior and exclusive end key need
+to be adapted to this requirement.
+ +If we had a system with one sstable per level, for each level lower +than L0, we are effectively done. We have represented the tight bounds +of each range delete and it is within the bounds of the sstable. This +works even with L0 => L0 compactions assuming they output exactly one +sstable. + +## The Mostly Simple World + +Here we have multiple files for levels lower than L0 that are non +overlapping in the file bounds. These multiple files occur because +compactions produce multiple files. This introduces the need to split a +range delete across the files being produced by a compaction. + +There is a clean way to split a range delete `[s, e)#n` into 2 parts +(which can be recursively applied to split into arbitrarily many +parts): split into `[s, m)#n` and `[m, e)#n`. These range deletes +apply to non-overlapping points and their tight bounds are `[s#m, +m#inf]`, `[m#n, e#inf]` which are also non-overlapping. + +Consider the following example of an input range delete `["c", "h")#10` and +the following two output files from a compaction: + +``` + sst1 sst2 +last point is "e"#7 | first point is "f"#20 +``` + +The range delete can be split into `["c", "f")#10` and `["f", +"h")#10`, by using the first point key of sst2 as the split +point. Then the bounds of sst1 and sst2 will be `[..., "f"#inf]` and +`["f"#20, ...]` which are non-overlapping. It is still possible to compute +the sstable bounds by looking at all the entries in the sstable. + +## The Real World + +Continuing with the same range delete `["c", "h")#10`, we can have the +following sstables produced during a compaction: + +``` + sst1 sst2 sst3 sst4 sst5 +points: "e"#7 | "f"#12 "f"#7 | "f"#4 "f"#3 | "f"#1 | "g"#15 +``` + +The range deletes written to these ssts are + +``` + sst1 sst2 sst3 sst4 sst5 +["c", "h")#10 | ["f", "h")#10 | ["f", "h")#10 | ["f", "h")#10 | ["g", "h")#10 +``` + +The Pebble code that achieves this effect is in +`rangedel.Fragmenter`. 
It is a code structuring artifact that sst1 +does not contain a range delete equal to `["c", "f")#10` and sst4 does +not contain `["f", "g")#10`. However for the range deletes in sst2 and +sst3 we cannot do any better because we don't know what the key +following "f" will be (the compaction cannot look ahead) and because +we don't have an `ImmediateSuccessor` function (otherwise we could +have written `["f", ImmediateSuccessor("f"))#10` to sst2, sst3). But +the code artifacts are not the ones introducing the real complexity. + +The range delete bounds are + +``` + sst1 sst2, sst3, sst4 sst5 +["c"#10, "h"#inf] ["f"#10, "h"#inf] ["g"#10, "h"#inf] + +``` + +We note the following: + +- The bounds of range deletes are overlapping since we have been + unable to split the range deletes. If these decide the sstable + bounds, the sstables will have overlapping bounds. This is not + permissible. + +- The range deletes included in each sstable result in that sstable + being "self-sufficient" wrt having the range delete that deletes + some of the points in the sstable (let us assume that the points in + this example have not been dropped from that sstable because of a + snapshot). + +- The transitions from sst1 to sst2 and sst4 to sst5 are **clean** in + that we can pretend that the range deletes in those files are actually: + +``` + sst1 sst2 sst3 sst4 sst5 +["c", "f")#10 | ["f", "g")#10 | ["f", "g")#10 | ["f", "g")#10 | ["g", "h")#10 +``` + +We could achieve some of these **clean** transitions (but not all) with a +code change. Also note that these better range deletes maintain the +"self-sufficient" property. + +### Making Non-overlapping SSTable bounds + +We force the sstable bounds to be non-overlapping by setting them to: + +``` + sst1 sst2 sst3 sst4 sst5 +["c"#10, "f"#inf] ["f"#12, "f"#7] ["f"#4, "f"#3] ["f"#1, "g"#inf] ["g"#15, "h"#inf] +``` + +Note that for sst1...sst4 the sstable bounds are smaller than the +bounds of the range deletes contained in them. 
The code that
+accomplishes this in Pebble is in `compaction.go`
+ +Luckily for us, this is harmless since these keys cannot have existed +in the system due to the levelling behavior: we cannot be writing +sst2...sst4 to level `i` if versions of `"f"` younger than `"f"#4` are +already in level `i` or version older than `"f"#7` have been left in +level i - 1. There is some trickery possible to prevent this "loss of +power" for queries (see the "Putting it together" section), but given +the history of bugs in this area, we should be cautious. + +### Improperly truncated Range Deletes + +We refer to range deletes that have experienced this "loss of power" +as **improper**. In the above example the range deletions in sst2, sst3, sst4 +are improper. The problem with improper range deletions occurs +when they need to participate in a future compaction: even though we +have restricted them to act-within their current sstable boundary, we +don't have a way of **"writing"** this restriction to a new sstable, +since they still need to be written in the `[s, e)#n` format. + +For example, sst2 has delete `["f", "h")#10` that must act-within +the bound `["f"#12, "f"#7]`. If sst2 was compacted down to the next +level into a new sstable (whose bounds we cannot predict because they +depend on other data written to that sstable) we need to be able to +write a range delete entry that follows the original restriction. But +the narrowest we can write is `["f", ImmediateSuccessor("f"))#10`. This +is an expansion of the act-within restriction with potentially +unintended consequences. In this case the expansion happened in the suffix. +For sst4, the range deletion `["f", "h")#10` must act-within `["f"#1, "g"#inf]`, +and we can precisely represent the constraint on the suffix by writing +`["f", "g")#10` but it does not precisely represent that this range delete +should not apply to `"f"#9`...`"f"#2`. + +In comparison, the sst1 range delete `["c", "h")#10` that must act-within +the bound `["c"#10, "f"#inf]` is not improper. 
This restriction can +be applied precisely to get a range delete `["c", "f")#10`. + +The solution to this is to note that while individual sstables have +improper range deletes, if we look at a collection of sstables we +can restore the improper range deletes spread across them to their proper self +(and their full power). To accurately find these improper range +deletes would require looking into the contents of a file, which is +expensive. But we can construct a pessimistic set based on +looking at the sequence of all files in a level and partitioning them: +adjacent files `f1`, `f2` with largest and smallest bound `k1#n1`, +`k2#n2` must be in the same partition if + +``` +k1 = k2 and n1 != inf +``` + +In the above example sst2, sst3, sst4 are one partition. The +**spanning bound** of this partition is `["f"#12, "g"#inf]` and the +range delete `["f", "h")#10` when constrained to act-within this +spanning bound is precisely the range delete `["f", +"g")#10`. Intuitively, the "loss of power" of this range delete has +been restored for the sake of making it proper, so it can be +accurately "written" in the output of the compaction (it may be +improperly fragmented again in the output, but we have already +discussed that). Such partitions are called "atomic compaction groups" +and must participate as a whole in a compaction (and a +compaction can use multiple atomic compaction groups as input). + +Consider another example: + +``` + sst1 sst2 +points: "e"#12 | "e"#10 +delete: ["c", "g")#8 | ["c", "g")#8 +bounds ["c"#8, "e"#12] | ["e"#10, "g"#inf] +``` + +sst1, sst2 are an atomic compaction group. Say we violated the +requirement that both be inputs in a compaction and only compacted +sst2 down to level `i + 1` and then down to level `i + 2`. Then we add +sst3 with bounds `["h"#10, "j"#5]` to level `i` and sst1 and sst3 are +compacted to level `i + 1` into a single sstable. 
This new sstable
+will have bounds `["c"#8, "j"#5]` so these bounds do not help with the
+original apply-within constraint on `["c", "g")#8` (that it should
+apply-within `["c"#8, "e"#12]`). The narrowest we can construct (if we had
+`ImmediateSuccessor`) would be `["c", ImmediateSuccessor("e"))#8`. Now we
+can incorrectly apply this range delete that is in level `i + 1` to `"e"#10`
+sitting in level `i + 2`. Note that this example can be made worse using
+sequence number zeroing -- `"e"#10` may have been rewritten to `"e"#0`.
+
+If a range delete `[s, e)#n` is in an atomic compaction group with
+spanning bounds `[k1#n1, k2#n2]` our construction above guarantees the
+following properties
+
+- `k1#n1 <= s#n`, so the bounds do not constrain the start of the
+  range delete.
+
+- `k2 >= e` or `n2 = inf`, so if `k2` is constraining the range delete
+  it will properly truncate the range delete.
+
+
+#### New sstable at sequence number 0
+
+A new sstable can be assigned sequence number 0 (and be written to L0)
+if the keys in the sstable are not in any other sstable. This
+comparison uses the keys and not key#seqnum, so the loss and
+restoration of power does not cause problems since that occurs within
+the versions of a single key.
+
+#### Flawed optimizations
+
+For the case where the atomic compaction group corresponds to the lower
+level of a compaction, it may initially seem to be correct to use only
+a prefix or suffix of that group in a compaction. In this case the
+prefix (suffix) will correspond to the largest key (smallest key) in
+the input sstables in the compaction and so can continue to constrain
+the range delete. 
For example, sst1 and sst2 are in the same atomic +compaction group + +``` + sst1 sst2 +points: "c"#10 "e"#12 | "e"#10 +delete: ["c", "g")#8 | ["c", "g")#8 +bounds ["c"#10, "e"#12] | ["e"#10, "g"#inf] +``` + +and this is the lower level of a compaction with + +``` + sst3 +points: "a"#14 "d"#15 +bounds ["a"#14, "d"#15] +``` + +we could allow for a compaction involving sst1 and sst3 which would produce + +``` + sst4 +points: "a"#14 "c"#10 "d"#15 "e"#12 +delete: ["c", "g")#8 +bounds ["a"#14, "e"#12] +``` + +and the range delete is still improper but its act-within constraint has +not expanded. + +But we have to be very careful to not have a more significant loss of power +of this range delete. Consider a situation where sst3 had a single delete +`"e"#16`. It still does not overlap in bounds with sst2 and we again pick +sst1 and sst3 for compaction. This single delete will cause `"e"#12` to be deleted +and sst4 bounds would be (unless we had complicated code preventing it): + +``` + sst4 +points: "a"#14 "c"#10 "d"#15 +delete: ["c", "g")#8 +bounds ["a"#14, "d"#15] +``` + +Now this delete cannot delete `"dd"#6` and we have lost the ability to know +that sst4 and sst2 are in the same atomic compaction group. + + +### Putting it together + +Summarizing the above, we have: + +- SStable bounds logic that ensures sstables are not +overlapping. These sstables contain range deletes that extend outside +these bounds. But these range deletes should **apply-within** the +sstable bounds. + +- Compactions: they need to constrain the range deletes in the inputs +to **apply-within**, but this can create problems with **writing** the +**improper** range deletes. The solution is to include the full +**atomic compaction group** in a compaction so we can restore the +**improper** range deletes to their **proper** self and then apply the +constraints of the atomic compaction group. + +- Queries: We need to act-within the file bound constraint on the range delete. 
+ Say the range delete is `[s, e)#n` and the file bound is `[b1#n1, + b2#n2]`. We are guaranteed that `b1#n1 <= s#n` so the only + constraint can come from `b2#n2`. + + - Deciding whether a range delete covers a key in the same or lower levels. + + - `b2 >= e`: there is no act-within constraint. + - `b2 < e`: to be precise we cannot let it delete `b2#n2-1` or + later keys. But it is likely that allowing it to delete up to + `b2#0` would be ok due to the atomic compaction group. This + would prevent the so-called "loss of power" discussed earlier if + one also includes the argument that the gap in the file bounds + that also represents the loss of power is harmless (the gap + exists within versions of key, and anyone doing a query for that + key will start from the sstable to the left of the gap). But it + may be better to be cautious. + + - For using the range delete to seek sstables at lower levels. + - `b2 >= e`: seek to `e` since there is no act-within constraint. + - `b2 < e`: seek to `b2`. We are ignoring that this range delete + is allowed to delete some versions of `b2` since this is just a + performance optimization. + + + + + + diff --git a/pebble/docs/rocksdb.md b/pebble/docs/rocksdb.md new file mode 100644 index 0000000..8cf7ae9 --- /dev/null +++ b/pebble/docs/rocksdb.md @@ -0,0 +1,757 @@ +# Pebble vs RocksDB: Implementation Differences + +RocksDB is a key-value store implemented using a Log-Structured +Merge-Tree (LSM). This document is not a primer on LSMs. There exist +some decent +[introductions](http://www.benstopford.com/2015/02/14/log-structured-merge-trees/) +on the web, or try chapter 3 of [Designing Data-Intensive +Applications](https://www.amazon.com/Designing-Data-Intensive-Applications-Reliable-Maintainable/dp/1449373321). 
+ +Pebble inherits the RocksDB file formats, has a similar API, and +shares many implementation details, but it also has many differences +that improve performance, reduce implementation complexity, or extend +functionality. This document highlights some of the more important +differences. + +* [Internal Keys](#internal-keys) +* [Indexed Batches](#indexed-batches) +* [Large Batches](#large-batches) +* [Commit Pipeline](#commit-pipeline) +* [Range Deletions](#range-deletions) +* [Flush and Compaction Pacing](#flush-and-compaction-pacing) +* [Write Throttling](#write-throttling) +* [Other Differences](#other-differences) + +## Internal Keys + +The external RocksDB API accepts keys and values. Due to the LSM +structure, keys are never updated in place, but overwritten with new +versions. Inside RocksDB, these versioned keys are known as Internal +Keys. An Internal Key is composed of the user specified key, a +sequence number and a kind. On disk, sstables always store Internal +Keys. + +``` + +-------------+------------+----------+ + | UserKey (N) | SeqNum (7) | Kind (1) | + +-------------+------------+----------+ +``` + +The `Kind` field indicates the type of key: set, merge, delete, etc. + +While Pebble inherits the Internal Key encoding for format +compatibility, it diverges from RocksDB in how it manages Internal +Keys in its implementation. In RocksDB, Internal Keys are represented +either in encoded form (as a string) or as a `ParsedInternalKey`. The +latter is a struct with the components of the Internal Key as three +separate fields. + +```c++ +struct ParsedInternalKey { + Slice user_key; + uint64 seqnum; + uint8 kind; +} +``` + +The component format is convenient: changing the `SeqNum` or `Kind` is +field assignment. Extracting the `UserKey` is a field +reference. However, RocksDB tends to only use `ParsedInternalKey` +locally. The major internal APIs, such as `InternalIterator`, operate +using encoded internal keys (i.e. 
strings) for parameters and return +values. + +To give a concrete example of the overhead this causes, consider +`Iterator::Seek(user_key)`. The external `Iterator` is implemented on +top of an `InternalIterator`. `Iterator::Seek` ends up calling +`InternalIterator::Seek`. Both Seek methods take a key, but +`InternalIterator::Seek` expects an encoded Internal Key. This is both +error prone and expensive. The key passed to `Iterator::Seek` needs to +be copied into a temporary string in order to append the `SeqNum` and +`Kind`. In Pebble, Internal Keys are represented in memory using an +`InternalKey` struct that is the analog of `ParsedInternalKey`. All +internal APIs use `InternalKeys`, with the exception of the lowest +level routines for decoding data from sstables. In Pebble, since the +interfaces all take and return the `InternalKey` struct, we don’t need +to allocate to construct the Internal Key from the User Key, but +RocksDB sometimes needs to allocate, and encode (i.e. make a +copy). The use of the encoded form also causes RocksDB to pass encoded +keys to the comparator routines, sometimes decoding the keys multiple +times during the course of processing. + +## Indexed Batches + +In RocksDB, a batch is the unit for all write operations. Even writing +a single key is transformed internally to a batch. The batch internal +representation is a contiguous byte buffer with a fixed 12-byte +header, followed by a series of records. + +``` + +------------+-----------+--- ... ---+ + | SeqNum (8) | Count (4) | Entries | + +------------+-----------+--- ... ---+ +``` + +Each record has a 1-byte kind tag prefix, followed by 1 or 2 length +prefixed strings (varstring): + +``` + +----------+-----------------+-------------------+ + | Kind (1) | Key (varstring) | Value (varstring) | + +----------+-----------------+-------------------+ +``` + +(The `Kind` indicates if there are 1 or 2 varstrings. `Set`, `Merge`, +and `DeleteRange` have 2 varstrings, while `Delete` has 1.) 
+
+Adding a mutation to a batch involves appending a new record to the
+buffer. This format is extremely fast for writes, but the lack of
+indexing makes it untenable to use directly for reads. In order to
+support iteration, a separate indexing structure is created. Both
+RocksDB and Pebble use a skiplist for the indexing structure, but with
+a clever twist. Rather than the skiplist storing a copy of the key, it
+simply stores the offset of the record within the mutation buffer. The
+result is that the skiplist acts as a multi-map (i.e. a map that can have
+duplicate entries for a given key). The iteration order for this map
+is constructed so that records sort on key, and for equal keys they
+sort on descending offset. Newer records for the same key appear
+before older records.
+
+While the indexing structure for batches is nearly identical between
+RocksDB and Pebble, how the index structure is used is completely
+different. In RocksDB, a batch is indexed using the
+`WriteBatchWithIndex` class. The `WriteBatchWithIndex` class provides
+a `NewIteratorWithBase` method that allows iteration over the merged
+view of the batch contents and an underlying "base" iterator created
+from the database. `BaseDeltaIterator` contains logic to iterate over
+the batch entries and the base iterator in parallel which allows us to
+perform reads on a snapshot of the database as though the batch had
+been applied to it. On the surface this sounds reasonable, yet the
+implementation is incomplete. Merge and DeleteRange operations are not
+supported. The reason they are not supported is because handling them
+is complex and requires duplicating logic that already exists inside
+RocksDB for normal iterator processing.
+
+Pebble takes a different approach to iterating over a merged view of a
+batch's contents and the underlying database: it treats the batch as
+another level in the LSM. Recall that an LSM is composed of zero or
+more memtable layers and zero or more sstable layers. 
Internally, both +RocksDB and Pebble contain a `MergingIterator` that knows how to merge +the operations from different levels, including processing overwritten +keys, merge operations, and delete range operations. The challenge +with treating the batch as another level to be used by a +`MergingIterator` is that the records in a batch do not have a +sequence number. The sequence number in the batch header is not +assigned until the batch is committed. The solution is to give the +batch records temporary sequence numbers. We need these temporary +sequence numbers to be larger than any other sequence number in the +database so that the records in the batch are considered newer than +any committed record. This is accomplished by reserving the high-bit +in the 56-bit sequence number for use as a marker for batch sequence +numbers. The sequence number for a record in an uncommitted batch is: + +``` + RecordOffset | (1<<55) +``` + +Newer records in a given batch will have a larger sequence number than +older records in the batch. And all of the records in a batch will +have larger sequence numbers than any committed record in the +database. + +The end result is that Pebble's batch iterators support all of the +functionality of regular database iterators with minimal additional +code. + +## Large Batches + +The size of a batch is limited only by available memory, yet the +required memory is not just the batch representation. When a batch is +committed, the commit operation iterates over the records in the batch +from oldest to newest and inserts them into the current memtable. The +memtable is an in-memory structure that buffers mutations that have +been committed (written to the Write Ahead Log), but not yet written +to an sstable. Internally, a memtable uses a skiplist to index +records. Each skiplist entry has overhead for the index links and +other metadata that is a dozen bytes at minimum. 
A large batch
+composed of many small records can require twice as much memory when
+inserted into a memtable as it required in the batch. And note that
+this causes a temporary increase in memory requirements because the
+batch memory is not freed until it is completely committed.
+
+A non-obvious implementation restriction present in both RocksDB and
+Pebble is that there is a one-to-one correspondence between WAL files
+and memtables. That is, a given WAL file has a single memtable
+associated with it and vice-versa. While this restriction could be
+removed, doing so is onerous and intricate. It should also be noted
+that committing a batch involves writing it to a single WAL file. The
+combination of restrictions results in a batch needing to be written
+entirely to a single memtable.
+
+What happens if a batch is too large to fit in a memtable? Memtables
+are generally considered to have a fixed size, yet this is not
+actually true in RocksDB. In RocksDB, the memtable skiplist is
+implemented on top of an arena structure. An arena is composed of a
+list of fixed size chunks, with no upper limit set for the number of
+chunks that can be associated with an arena. So RocksDB handles large
+batches by allowing a memtable to grow beyond its configured
+size. Concretely, while RocksDB may be configured with a 64MB memtable
+size, a 1GB batch will cause the memtable to grow to accommodate
+it. Functionally, this is good, though there is a practical problem: a
+large batch is first written to the WAL, and then added to the
+memtable. Adding the large batch to the memtable may consume so much
+memory that the system runs out of memory and is killed by the
+kernel. This can result in a death loop because upon restarting, the
+batch is read from the WAL and applied to the memtable again.
+
+In Pebble, the memtable is also implemented using a skiplist on top of
+an arena. Significantly, the Pebble arena is a fixed size. 
While the +RocksDB skiplist uses pointers, the Pebble skiplist uses offsets from +the start of the arena. The fixed size arena means that the Pebble +memtable cannot expand arbitrarily. A batch that is too large to fit +in the memtable causes the current mutable memtable to be marked as +immutable and the batch is wrapped in a `flushableBatch` structure and +added to the list of immutable memtables. Because the `flushableBatch` +is readable as another layer in the LSM, the batch commit can return +as soon as the `flushableBatch` has been added to the immutable +memtable list. + +Internally, a `flushableBatch` provides iterator support by sorting +the batch contents (the batch is sorted once, when it is added to the +memtable list). Sorting the batch contents and insertion of the +contents into a memtable have the same big-O time, but the constant +factor dominates here. Sorting is significantly faster and uses +significantly less memory due to not having to copy the batch records. + +Note that an effect of this large batch support is that Pebble can be +configured as an efficient on-disk sorter: specify a small memtable +size, disable the WAL, and set a large L0 compaction threshold. In +order to sort a large amount of data, create batches that are larger +than the memtable size and commit them. When committed these batches +will not be inserted into a memtable, but instead sorted and then +written out to L0. The fully sorted data can later be read and the +normal merging process will take care of the final ordering. + +## Commit Pipeline + +The commit pipeline is the component which manages the steps in +committing write batches, such as writing the batch to the WAL and +applying its contents to the memtable. While simple conceptually, the +commit pipeline is crucial for high performance. 
In the absence of +concurrency, commit performance is limited by how fast a batch can be +written (and synced) to the WAL and then added to the memtable, both +of which are outside of the purview of the commit pipeline. + +To understand the challenge here, it is useful to have a conception of +the WAL (write-ahead log). The WAL contains a record of all of the +batches that have been committed to the database. As a record is +written to the WAL it is added to the memtable. Each record is +assigned a sequence number which is used to distinguish newer updates +from older ones. Conceptually the WAL looks like: + +``` ++--------------------------------------+ +| Batch(SeqNum=1,Count=9,Records=...) | ++--------------------------------------+ +| Batch(SeqNum=10,Count=5,Records=...) | ++--------------------------------------+ +| Batch(SeqNum=15,Count=7,Records...) | ++--------------------------------------+ +| ... | ++--------------------------------------+ +``` + +Note that each WAL entry is precisely the batch representation +described earlier in the [Indexed Batches](#indexed-batches) +section. The monotonically increasing sequence numbers are a critical +component in allowing RocksDB and Pebble to provide fast snapshot +views of the database for reads. + +If concurrent performance was not a concern, the commit pipeline could +simply be a mutex which serialized writes to the WAL and application +of the batch records to the memtable. Concurrent performance is a +concern, though. + +The primary challenge in concurrent performance in the commit pipeline +is maintaining two invariants: + +1. Batches need to be written to the WAL in sequence number order. +2. Batches need to be made visible for reads in sequence number + order. This invariant arises from the use of a single sequence + number which indicates which mutations are visible. + +The second invariant deserves explanation. RocksDB and Pebble both +keep track of a visible sequence number. 
This is the sequence number +for which records in the database are visible during reads. The +visible sequence number exists because committing a batch is an atomic +operation, yet adding records to the memtable is done without an +exclusive lock (the skiplists used by both Pebble and RocksDB are +lock-free). When the records from a batch are being added to the +memtable, a concurrent read operation may see those records, but will +skip over them because they are newer than the visible sequence +number. Once all of the records in the batch have been added to the +memtable, the visible sequence number is atomically incremented. + +So we have four steps in committing a write batch: + +1. Write the batch to the WAL +2. Apply the mutations in the batch to the memtable +3. Bump the visible sequence number +4. (Optionally) sync the WAL + +Writing the batch to the WAL is actually very fast as it is just a +memory copy. Applying the mutations in the batch to the memtable is by +far the most CPU intensive part of the commit pipeline. Syncing the +WAL is the most expensive from a wall clock perspective. + +With that background out of the way, let's examine how RocksDB commits +batches. This description is of the traditional commit pipeline in +RocksDB (i.e. the one used by CockroachDB). + +RocksDB achieves concurrency in the commit pipeline by grouping +concurrently committed batches into a batch group. Each group is +assigned a "leader" which is the first batch to be added to the +group. The batch group is written atomically to the WAL by the leader +thread, and then the individual batches making up the group are +concurrently applied to the memtable. Lastly, the visible sequence +number is bumped such that all of the batches in the group become +visible in a single atomic step. While a batch group is being applied, +other concurrent commits are added to a waiting list. When the group +commit finishes, the waiting commits form the next group. 
+ +There are two criticisms of the batch grouping approach. The first is +that forming a batch group involves copying batch contents. RocksDB +partially alleviates this for large batches by placing a limit on the +total size of a group. A large batch will end up in its own group and +not be copied, but the criticism still applies for small batches. Note +that there are actually two copies here. The batch contents are +concatenated together to form the group, and then the group contents +are written into an in memory buffer for the WAL before being written +to disk. + +The second criticism is about the thread synchronization points. Let's +consider what happens to a commit which becomes the leader: + +1. Lock commit mutex +2. Wait to become leader +3. Form (concatenate) batch group and write to the WAL +4. Notify followers to apply their batch to the memtable +5. Apply own batch to memtable +6. Wait for followers to finish +7. Bump visible sequence number +8. Unlock commit mutex +9. Notify followers that the commit is complete + +The follower's set of operations looks like: + +1. Lock commit mutex +2. Wait to become follower +3. Wait to be notified that it is time to apply batch +4. Unlock commit mutex +5. Apply batch to memtable +6. Wait to be notified that commit is complete + +The thread synchronization points (all of the waits and notifies) are +overhead. Reducing that overhead can improve performance. + +The Pebble commit pipeline addresses both criticisms. The main +innovation is a commit queue that mirrors the commit order. The Pebble +commit pipeline looks like: + +1. Lock commit mutex + * Add batch to commit queue + * Assign batch sequence number + * Write batch to the WAL +2. Unlock commit mutex +3. Apply batch to memtable (concurrently) +4. Publish batch sequence number + +Pebble does not use the concept of a batch group. Each batch is +individually written to the WAL, but note that the WAL write is just a +memory copy into an internal buffer in the WAL. 
+
+Step 4 deserves further scrutiny as it is where the invariant on the
+visible batch sequence number is maintained. Publishing the batch
+sequence number cannot simply bump the visible sequence number because
+batches with earlier sequence numbers may still be applying to the
+memtable. If we were to ratchet the visible sequence number without
+waiting for those applies to finish, a concurrent reader could see
+partial batch contents. Note that RocksDB has experimented with
+allowing these semantics with its unordered writes option.
+
+We want to retain the atomic visibility of batch commits. The publish
+batch sequence number step needs to ensure that we don't ratchet the
+visible sequence number until all batches with earlier sequence
+numbers have applied. Enter the commit queue: a lock-free
+single-producer, multi-consumer queue. Batches are added to the commit
+queue with the commit mutex held, ensuring the same order as the
+sequence number assignment. After a batch finishes applying to the
+memtable, it atomically marks the batch as applied. It then removes
+the prefix of applied batches from the commit queue, bumping the
+visible sequence number, and marking the batch as committed (via a
+`sync.WaitGroup`). If the first batch in the commit queue has not been
+applied, we wait for our batch to be committed, relying on another
+concurrent committer to perform the visible sequence ratcheting for
+our batch. We know a concurrent commit is taking place because if
+there was only one batch committing it would be at the head of the
+commit queue.
+
+There are two possibilities when publishing a sequence number. The
+first is that there is an unapplied batch at the head of the
+queue. Consider the following scenario where we're trying to publish
+the sequence number for batch `B`.
+
+```
+ +---------------+-------------+---------------+-----+
+ | A (unapplied) | B (applied) | C (unapplied) | ... 
|
+ +---------------+-------------+---------------+-----+
+```
+
+The publish routine will see that `A` is unapplied and then simply
+wait for `B's` done `sync.WaitGroup` to be signalled. This is safe
+because `A` must still be committing. And if `A` has concurrently been
+marked as applied, the goroutine publishing `A` will then publish
+`B`. What happens when `A` publishes its sequence number? The commit
+queue state becomes:
+
+```
+ +-------------+-------------+---------------+-----+
+ | A (applied) | B (applied) | C (unapplied) | ... |
+ +-------------+-------------+---------------+-----+
+```
+
+The publish routine pops `A` from the queue, ratchets the sequence
+number, then pops `B` and ratchets the sequence number again, and then
+finds `C` and stops. A detail that it is important to notice is that
+the committer for batch `B` didn't have to do any more work. An
+alternative approach would be to have `B` wake up and ratchet its own
+sequence number, but that would serialize the remainder of the commit
+queue behind that goroutine waking up.
+
+The commit queue reduces the number of thread synchronization
+operations required to commit a batch. There is no leader to notify,
+or followers to wait for. A commit either publishes its own sequence
+number, or performs one synchronization operation to wait for a
+concurrent committer to publish its sequence number.
+
+## Range Deletions
+
+Deletion of an individual key in RocksDB and Pebble is accomplished by
+writing a deletion tombstone. A deletion tombstone shadows an existing
+value for a key, causing reads to treat the key as not present. The
+deletion tombstone mechanism works well for deleting small sets of
+keys, but what happens if you want to delete all of the keys within a range
+of keys that might number in the thousands or millions? A range
+deletion is an operation which deletes an entire range of keys with a
+single record. 
In contrast to a point deletion tombstone which +specifies a single key, a range deletion tombstone (a.k.a. range +tombstone) specifies a start key (inclusive) and an end key +(exclusive). This single record is much faster to write than thousands +or millions of point deletion tombstones, and can be done blindly -- +without iterating over the keys that need to be deleted. The downside +to range tombstones is that they require additional processing during +reads. How the processing of range tombstones is done significantly +affects both the complexity of the implementation, and the efficiency +of read operations in the presence of range tombstones. + +A range tombstone is composed of a start key, end key, and sequence +number. Any key that falls within the range is considered deleted if +the key's sequence number is less than the range tombstone's sequence +number. RocksDB stores range tombstones segregated from point +operations in a special range deletion block within each sstable. +Conceptually, the range tombstones stored within an sstable are +truncated to the boundaries of the sstable, though there are +complexities that cause this to not actually be physically true. + +In RocksDB, the main structure implementing range tombstone processing +is the `RangeDelAggregator`. Each read operation and iterator has its +own `RangeDelAggregator` configured for the sequence number the read +is taking place at. The initial implementation of `RangeDelAggregator` +built up a "skyline" for the range tombstones visible at the read +sequence number. + +``` +10 +---+ + 9 | | + 8 | | + 7 | +----+ + 6 | | + 5 +-+ | +----+ + 4 | | | | + 3 | | | +---+ + 2 | | | | + 1 | | | | + 0 | | | | + abcdefghijklmnopqrstuvwxyz +``` + +The above diagram shows the skyline created for the range tombstones +`[b,j)#5`, `[d,h)#10`, `[f,m)#7`, `[p,u)#5`, and `[t,y)#3`. The +skyline is queried for each key read to see if the key should be +considered deleted or not. 
The skyline structure is stored in a binary +tree, making the queries an O(logn) operation in the number of +tombstones, though there is an optimization to make this O(1) for +`next`/`prev` iteration. Note that the skyline representation loses +information about the range tombstones. This requires the structure to +be rebuilt on every read which has a significant performance impact. + +The initial skyline range tombstone implementation has since been +replaced with a more efficient lookup structure. See the +[DeleteRange](https://rocksdb.org/blog/2018/11/21/delete-range.html) +blog post for a good description of both the original implementation +and the new (v2) implementation. The key change in the new +implementation is to "fragment" the range tombstones that are stored +in an sstable. The fragmented range tombstones provide the same +benefit as the skyline representation: the ability to binary search +the fragments in order to find the tombstone covering a key. But +unlike the skyline approach, the fragmented tombstones can be cached +on a per-sstable basis. In the v2 approach, `RangeDelAggregator` keeps +track of the fragmented range tombstones for each sstable encountered +during a read or iterator, and logically merges them together. + +Fragmenting range tombstones involves splitting range tombstones at +overlap points. Let's consider the tombstones in the skyline example +above: + +``` +10: d---h + 7: f------m + 5: b-------j p----u + 3: t----y +``` + +Fragmenting the range tombstones at the overlap points creates a +larger number of range tombstones: + +``` +10: d-f-h + 7: f-h-j--m + 5: b-d-f-h-j p---tu + 3: tu---y +``` + +While the number of tombstones is larger there is a significant +advantage: we can order the tombstones by their start key and then +binary search to find the set of tombstones overlapping a particular +point. 
This is possible because due to the fragmenting, all the +tombstones that overlap a range of keys will have the same start and +end key. The v2 `RangeDelAggregator` and associated classes perform +fragmentation of range tombstones stored in each sstable and those +fragmented tombstones are then cached. + +In summary, in RocksDB `RangeDelAggregator` acts as an oracle for +answering whether a key is deleted at a particular sequence +number. Due to caching of fragmented tombstones, the v2 implementation +of `RangeDelAggregator` implementation is significantly faster to +populate than v1, yet the overall approach to processing range +tombstones remains similar. + +Pebble takes a different approach: it integrates range tombstones +processing directly into the `mergingIter` structure. `mergingIter` is +the internal structure which provides a merged view of the levels in +an LSM. RocksDB has a similar class named +`MergingIterator`. Internally, `mergingIter` maintains a heap over the +levels in the LSM (note that each memtable and L0 table is a separate +"level" in `mergingIter`). In RocksDB, `MergingIterator` knows nothing +about range tombstones, and it is thus up to higher-level code to +process range tombstones using `RangeDelAggregator`. + +While the separation of `MergingIterator` and range tombstones seems +reasonable at first glance, there is an optimization that RocksDB does +not perform which is awkward with the `RangeDelAggregator` approach: +skipping swaths of deleted keys. A range tombstone often shadows more +than one key. Rather than iterating over the deleted keys, it is much +quicker to seek to the end point of the range tombstone. The challenge +in implementing this optimization is that a key might be newer than +the range tombstone and thus shouldn't be skipped. An insight to be +utilized is that the level structure itself provides sufficient +information. A range tombstone at `Ln` is guaranteed to be newer than +any key it overlaps in `Ln+1`. 
+ +Pebble utilizes the insight above to integrate range deletion +processing with `mergingIter`. A `mergingIter` maintains a point +iterator and a range deletion iterator per level in the LSM. In this +context, every L0 table is a separate level, as is every +memtable. Within a level, when a range deletion contains a point +operation the sequence numbers must be checked to determine if the +point operation is newer or older than the range deletion +tombstone. The `mergingIter` maintains the invariant that the range +deletion iterators for all levels newer than the current iteration key +are positioned at the next (or previous during reverse iteration) +range deletion tombstone. We know those levels don't contain a range +deletion tombstone that covers the current key because if they did the +current key would be deleted. The range deletion iterator for the +current key's level is positioned at a range tombstone covering or +past the current key. The position of all of the other range deletion +iterators is unspecified. Whenever a key from those levels becomes the +current key, their range deletion iterators need to be +positioned. This lazy positioning avoids seeking the range deletion +iterators for keys that are never considered. + +For a full example, consider the following setup: + +``` + p0: o + r0: m---q + + p1: n p + r1: g---k + + p2: b d i + r2: a---e q----v + + p3: e + r3: +``` + +The diagram above is showing 4 levels, with `pX` indicating the +point operations in a level and `rX` indicating the range tombstones. + +If we start iterating from the beginning, the first key we encounter +is `b` in `p2`. When the mergingIter is pointing at a valid entry, the +range deletion iterators for all of the levels less than the current +key's level are positioned at the next range tombstone past the +current key. So `r0` will point at `[m,q)` and `r1` at `[g,k)`. 
When +the key `b` is encountered, we check to see if the current tombstone +for `r0` or `r1` contains it, and whether the tombstone for `r2`, +`[a,e)`, contains and is newer than `b`. + +Advancing the iterator finds the next key at `d`. This is in the same +level as the previous key `b` so we don't have to reposition any of +the range deletion iterators, but merely check whether `d` is now +contained by any of the range tombstones at higher levels or has +stepped past the range tombstone in its own level. In this case, there +is nothing to be done. + +Advancing the iterator again finds `e`. Since `e` comes from `p3`, we +have to position the `r3` range deletion iterator, which is empty. `e` +is past the `r2` tombstone of `[a,e)` so we need to advance the `r2` +range deletion iterator to `[q,v)`. + +The next key is `i`. Because this key is in `p2`, a level above `e`, +we don't have to reposition any range deletion iterators and instead +see that `i` is covered by the range tombstone `[g,k)`. The iterator +is immediately advanced to `n` which is covered by the range tombstone +`[m,q)` causing the iterator to advance to `o` which is visible. + +## Flush and Compaction Pacing + +Flushes and compactions in LSM trees are problematic because they +contend with foreground traffic, resulting in write and read latency +spikes. Without throttling the rate of flushes and compactions, they +occur "as fast as possible" (which is not entirely true, since we +have a `bytes_per_sync` option). This instantaneous usage of CPU and +disk IO results in potentially huge latency spikes for writes and +reads which occur in parallel to the flushes and compactions. + +RocksDB attempts to solve this issue by offering an option to limit +the speed of flushes and compactions. A maximum `bytes/sec` can be +specified through the options, and background IO usage will be limited +to the specified amount. Flushes are given priority over compactions, +but they still use the same rate limiter. 
Though simple to implement +and understand, this option is fragile for various reasons. + +1) If the rate limit is configured too low, the DB will stall and +write throughput will be affected. +2) If the rate limit is configured too high, the write and read +latency spikes will persist. +3) A different configuration is needed per system depending on the +speed of the storage device. +4) Write rates typically do not stay the same throughout the lifetime +of the DB (higher throughput during certain times of the day, etc) but +the rate limit cannot be configured during runtime. + +RocksDB also offers an +["auto-tuned" rate limiter](https://rocksdb.org/blog/2017/12/18/17-auto-tuned-rate-limiter.html) +which uses a simple multiplicative-increase, multiplicative-decrease +algorithm to dynamically adjust the background IO rate limit depending +on how much of the rate limiter has been exhausted in an interval. +This solves the problem of having a static rate limit, but Pebble +attempts to improve on this with a different pacing mechanism. + +Pebble's pacing mechanism uses separate rate limiters for flushes and +compactions. Both the flush and compaction pacing mechanisms work by +attempting to flush and compact only as fast as needed and no faster. +This is achieved differently for flushes versus compactions. + +For flush pacing, Pebble keeps the rate at which the memtable is +flushed at the same rate as user writes. This ensures that disk IO +used by flushes remains steady. When a mutable memtable becomes full +and is marked immutable, it is typically flushed as fast as possible. +Instead of flushing as fast as possible, what we do is look at the +total number of bytes in all the memtables (mutable + queue of +immutables) and subtract the number of bytes that have been flushed in +the current flush. This number gives us the total number of bytes +which remain to be flushed. 
If we keep this number steady at a constant +level, we have the invariant that the flush rate is equal to the write +rate. + +When the number of bytes remaining to be flushed falls below our +target level, we slow down the speed of flushing. We keep a minimum +rate at which the memtable is flushed so that flushes proceed even if +writes have stopped. When the number of bytes remaining to be flushed +goes above our target level, we allow the flush to proceed as fast as +possible, without applying any rate limiting. However, note that the +second case would indicate that writes are occurring faster than the +memtable can flush, which would be an unsustainable rate. The LSM +would soon hit the memtable count stall condition and writes would be +completely stopped. + +For compaction pacing, Pebble uses an estimation of compaction debt, +which is the number of bytes which need to be compacted before no +further compactions are needed. This estimation is calculated by +looking at the number of bytes that have been flushed by the current +flush routine, adding those bytes to the size of the level 0 sstables, +then seeing how many bytes exceed the target number of bytes for the +level 0 sstables. We multiply the number of bytes exceeded by the +level ratio and add that number to the compaction debt estimate. +We repeat this process until the final level, which gives us a final +compaction debt estimate for the entire LSM tree. + +Like with flush pacing, we want to keep the compaction debt at a +constant level. This ensures that compactions occur only as fast as +needed and no faster. If the compaction debt estimate falls below our +target level, we slow down compactions. We maintain a minimum +compaction rate so that compactions proceed even if flushes have +stopped. If the compaction debt goes above our target level, we let +compactions proceed as fast as possible without any rate limiting. 
+Just like with flush pacing, this would indicate that writes are +occurring faster than the background compactions can keep up with, +which is an unsustainable rate. The LSM's read amplification would +increase and the L0 file count stall condition would be hit. + +With the combined flush and compaction pacing mechanisms, flushes and +compactions only occur as fast as needed and no faster, which reduces +latency spikes for user read and write operations. + +## Write throttling + +RocksDB adds artificial delays to user writes when certain thresholds +are met, such as `l0_slowdown_writes_threshold`. These artificial +delays occur when the system is close to stalling to lessen the write +pressure so that flushing and compactions can catch up. On the surface +this seems good, since write stalls would seemingly be eliminated and +replaced with gradual slowdowns. Closed loop write latency benchmarks +would show the elimination of abrupt write stalls, which seems +desirable. + +However, this doesn't do anything to improve latencies in an open loop +model, which is the model more likely to resemble real world use +cases. Artificial delays increase write latencies without a clear +benefit. Writes stalls in an open loop system would indicate that +writes are generated faster than the system could possibly handle, +which adding artificial delays won't solve. + +For this reason, Pebble doesn't add artificial delays to user writes +and writes are served as quickly as possible. 
+ +### Other Differences + +* `internalIterator` API which minimizes indirect (virtual) function + calls +* Previous pointers in the memtable and indexed batch skiplists +* Elision of per-key lower/upper bound checks in long range scans +* Improved `Iterator` API + + `SeekPrefixGE` for prefix iteration + + `SetBounds` for adjusting the bounds on an existing `Iterator` +* Simpler `Get` implementation diff --git a/pebble/error_iter.go b/pebble/error_iter.go new file mode 100644 index 0000000..10bc9cc --- /dev/null +++ b/pebble/error_iter.go @@ -0,0 +1,86 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" +) + +type errorIter struct { + err error +} + +// errorIter implements the base.InternalIterator interface. +var _ internalIterator = (*errorIter)(nil) + +func (c *errorIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { + return nil, base.LazyValue{} +} + +func (c *errorIter) SeekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + return nil, base.LazyValue{} +} + +func (c *errorIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { + return nil, base.LazyValue{} +} + +func (c *errorIter) First() (*InternalKey, base.LazyValue) { + return nil, base.LazyValue{} +} + +func (c *errorIter) Last() (*InternalKey, base.LazyValue) { + return nil, base.LazyValue{} +} + +func (c *errorIter) Next() (*InternalKey, base.LazyValue) { + return nil, base.LazyValue{} +} + +func (c *errorIter) Prev() (*InternalKey, base.LazyValue) { + return nil, base.LazyValue{} +} + +func (c *errorIter) NextPrefix([]byte) (*InternalKey, base.LazyValue) { + return nil, base.LazyValue{} +} + +func (c *errorIter) Error() error { 
+ return c.err +} + +func (c *errorIter) Close() error { + return c.err +} + +func (c *errorIter) String() string { + return "error" +} + +func (c *errorIter) SetBounds(lower, upper []byte) {} + +func (c *errorIter) SetContext(_ context.Context) {} + +type errorKeyspanIter struct { + err error +} + +// errorKeyspanIter implements the keyspan.FragmentIterator interface. +var _ keyspan.FragmentIterator = (*errorKeyspanIter)(nil) + +func (*errorKeyspanIter) SeekGE(key []byte) *keyspan.Span { return nil } +func (*errorKeyspanIter) SeekLT(key []byte) *keyspan.Span { return nil } +func (*errorKeyspanIter) First() *keyspan.Span { return nil } +func (*errorKeyspanIter) Last() *keyspan.Span { return nil } +func (*errorKeyspanIter) Next() *keyspan.Span { return nil } +func (*errorKeyspanIter) Prev() *keyspan.Span { return nil } +func (i *errorKeyspanIter) Error() error { return i.err } +func (i *errorKeyspanIter) Close() error { return i.err } +func (*errorKeyspanIter) String() string { return "error" } diff --git a/pebble/error_test.go b/pebble/error_test.go new file mode 100644 index 0000000..82af4a4 --- /dev/null +++ b/pebble/error_test.go @@ -0,0 +1,429 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "fmt" + "math" + "strings" + "sync/atomic" + "testing" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/vfs/errorfs" + "github.com/stretchr/testify/require" +) + +type panicLogger struct{} + +func (l panicLogger) Infof(format string, args ...interface{}) {} +func (l panicLogger) Errorf(format string, args ...interface{}) {} + +func (l panicLogger) Fatalf(format string, args ...interface{}) { + panic(errors.Errorf("fatal: "+format, args...)) +} + +// corruptFS injects a corruption in the `index`th byte read. 
+type corruptFS struct { + vfs.FS + // index is the index of the byte which we will corrupt. + index atomic.Int32 + bytesRead atomic.Int32 +} + +func (fs *corruptFS) maybeCorrupt(n int32, p []byte) { + newBytesRead := fs.bytesRead.Add(n) + pIdx := newBytesRead - 1 - fs.index.Load() + if pIdx >= 0 && pIdx < n { + p[pIdx]++ + } +} + +func (fs *corruptFS) maybeCorruptAt(n int32, p []byte, offset int64) { + pIdx := fs.index.Load() - int32(offset) + if pIdx >= 0 && pIdx < n { + p[pIdx]++ + } +} + +func (fs *corruptFS) Open(name string, opts ...vfs.OpenOption) (vfs.File, error) { + f, err := fs.FS.Open(name) + if err != nil { + return nil, err + } + cf := corruptFile{f, fs} + for _, opt := range opts { + opt.Apply(cf) + } + return cf, nil +} + +type corruptFile struct { + vfs.File + fs *corruptFS +} + +func (f corruptFile) Read(p []byte) (int, error) { + n, err := f.File.Read(p) + f.fs.maybeCorrupt(int32(n), p) + return n, err +} + +func (f corruptFile) ReadAt(p []byte, off int64) (int, error) { + n, err := f.File.ReadAt(p, off) + f.fs.maybeCorruptAt(int32(n), p, off) + return n, err +} + +func expectLSM(expected string, d *DB, t *testing.T) { + t.Helper() + expected = strings.TrimSpace(expected) + d.mu.Lock() + actual := d.mu.versions.currentVersion().String() + d.mu.Unlock() + actual = strings.TrimSpace(actual) + if expected != actual { + t.Fatalf("expected\n%s\nbut found\n%s", expected, actual) + } +} + +// TestErrors repeatedly runs a short sequence of operations, injecting FS +// errors at different points, until success is achieved. 
+func TestErrors(t *testing.T) { + run := func(fs *errorfs.FS) (err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(error); ok { + err = e + } else { + t.Fatal(r) + } + } + }() + + d, err := Open("", &Options{ + FS: fs, + Logger: panicLogger{}, + }) + if err != nil { + return err + } + + key := []byte("a") + value := []byte("b") + if err := d.Set(key, value, nil); err != nil { + return err + } + if err := d.Flush(); err != nil { + return err + } + if err := d.Compact(nil, []byte("\xff"), false); err != nil { + return err + } + + iter, _ := d.NewIter(nil) + for valid := iter.First(); valid; valid = iter.Next() { + } + if err := iter.Close(); err != nil { + return err + } + return d.Close() + } + + errorCounts := make(map[string]int) + for i := int32(0); ; i++ { + fs := errorfs.Wrap(vfs.NewMem(), errorfs.ErrInjected.If(errorfs.OnIndex(i))) + err := run(fs) + if err == nil { + t.Logf("success %d\n", i) + break + } + errorCounts[err.Error()]++ + } + + expectedErrors := []string{ + "fatal: MANIFEST flush failed: injected error", + "fatal: MANIFEST sync failed: injected error", + "fatal: MANIFEST set current failed: injected error", + "fatal: MANIFEST dirsync failed: injected error", + } + for _, expected := range expectedErrors { + if errorCounts[expected] == 0 { + t.Errorf("expected error %q did not occur", expected) + } + } +} + +// TestRequireReadError injects FS errors into read operations at successively later +// points until all operations can complete. It requires an operation fails any time +// an error was injected. This differs from the TestErrors case above as that one +// cannot require operations fail since it involves flush/compaction, which retry +// internally and succeed following an injected error. +func TestRequireReadError(t *testing.T) { + run := func(formatVersion FormatMajorVersion, index int32) (err error) { + // Perform setup with error injection disabled as it involves writes/background ops. 
+ ii := errorfs.OnIndex(-1) + fs := errorfs.Wrap(vfs.NewMem(), errorfs.ErrInjected.If(ii)) + opts := &Options{ + FS: fs, + Logger: panicLogger{}, + FormatMajorVersion: formatVersion, + } + opts.private.disableTableStats = true + d, err := Open("", opts) + require.NoError(t, err) + + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + key1 := []byte("a1") + key2 := []byte("a2") + value := []byte("b") + require.NoError(t, d.Set(key1, value, nil)) + require.NoError(t, d.Set(key2, value, nil)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Compact(key1, key2, false)) + require.NoError(t, d.DeleteRange(key1, key2, nil)) + require.NoError(t, d.Set(key1, value, nil)) + require.NoError(t, d.Flush()) + if formatVersion < FormatSetWithDelete { + expectLSM(` +0.0: + 000007:[a1#13,SET-a2#inf,RANGEDEL] +6: + 000005:[a1#10,SET-a2#11,SET] +`, d, t) + } else { + expectLSM(` +0.0: + 000007:[a1#13,SETWITHDEL-a2#inf,RANGEDEL] +6: + 000005:[a1#10,SET-a2#11,SET] +`, d, t) + } + + // Now perform foreground ops with error injection enabled. + ii.Store(index) + iter, _ := d.NewIter(nil) + if err := iter.Error(); err != nil { + return err + } + numFound := 0 + expectedKeys := [][]byte{key1, key2} + for valid := iter.First(); valid; valid = iter.Next() { + if !bytes.Equal(iter.Key(), expectedKeys[numFound]) { + t.Fatalf("expected key %v; found %v", expectedKeys[numFound], iter.Key()) + } + if !bytes.Equal(iter.Value(), value) { + t.Fatalf("expected value %v; found %v", value, iter.Value()) + } + numFound++ + } + if err := iter.Close(); err != nil { + return err + } + if err := d.Close(); err != nil { + d = nil + return err + } + d = nil + // Reaching here implies all read operations succeeded. This + // should only happen when we reached a large enough index at + // which `errorfs.FS` did not return any error. 
+ if i := ii.Load(); i < 0 { + t.Errorf("FS error injected %d ops ago went unreported", -i) + } + if numFound != 2 { + t.Fatalf("expected 2 values; found %d", numFound) + } + return nil + } + + versions := []FormatMajorVersion{FormatMostCompatible, FormatSetWithDelete} + for _, version := range versions { + t.Run(fmt.Sprintf("version-%s", version), func(t *testing.T) { + for i := int32(0); ; i++ { + err := run(version, i) + if err == nil { + t.Logf("no failures reported at index %d", i) + break + } + } + }) + } +} + +// TestCorruptReadError verifies that reads to a corrupted file detect the +// corruption and return an error. In this case the filesystem reads return +// successful status but the data they return is corrupt. +func TestCorruptReadError(t *testing.T) { + run := func(formatVersion FormatMajorVersion, index int32) (err error) { + // Perform setup with corruption injection disabled as it involves writes/background ops. + fs := &corruptFS{ + FS: vfs.NewMem(), + } + fs.index.Store(-1) + opts := &Options{ + FS: fs, + Logger: panicLogger{}, + FormatMajorVersion: formatVersion, + } + opts.private.disableTableStats = true + d, err := Open("", opts) + if err != nil { + t.Fatalf("%v", err) + } + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + key1 := []byte("a1") + key2 := []byte("a2") + value := []byte("b") + require.NoError(t, d.Set(key1, value, nil)) + require.NoError(t, d.Set(key2, value, nil)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Compact(key1, key2, false)) + require.NoError(t, d.DeleteRange(key1, key2, nil)) + require.NoError(t, d.Set(key1, value, nil)) + require.NoError(t, d.Flush()) + if formatVersion < FormatSetWithDelete { + expectLSM(` +0.0: + 000007:[a1#13,SET-a2#inf,RANGEDEL] +6: + 000005:[a1#10,SET-a2#11,SET] +`, d, t) + + } else { + expectLSM(` +0.0: + 000007:[a1#13,SETWITHDEL-a2#inf,RANGEDEL] +6: + 000005:[a1#10,SET-a2#11,SET] +`, d, t) + } + + // Now perform foreground ops with corruption injection 
enabled. + fs.index.Store(index) + iter, _ := d.NewIter(nil) + if err := iter.Error(); err != nil { + return err + } + + numFound := 0 + expectedKeys := [][]byte{key1, key2} + for valid := iter.First(); valid; valid = iter.Next() { + if !bytes.Equal(iter.Key(), expectedKeys[numFound]) { + t.Fatalf("expected key %v; found %v", expectedKeys[numFound], iter.Key()) + } + if !bytes.Equal(iter.Value(), value) { + t.Fatalf("expected value %v; found %v", value, iter.Value()) + } + numFound++ + } + if err := iter.Close(); err != nil { + return err + } + if err := d.Close(); err != nil { + return err + } + d = nil + // Reaching here implies all read operations succeeded. This + // should only happen when we reached a large enough index at + // which `corruptFS` did not inject any corruption. + if bytesRead := fs.bytesRead.Load(); bytesRead > index { + t.Errorf("corruption error injected at index %d went unreported", index) + } + if numFound != 2 { + t.Fatalf("expected 2 values; found %d", numFound) + } + return nil + } + versions := []FormatMajorVersion{FormatMostCompatible, FormatSetWithDelete} + for _, version := range versions { + t.Run(fmt.Sprintf("version-%s", version), func(t *testing.T) { + for i := int32(0); ; i++ { + err := run(version, i) + if err == nil { + t.Logf("no failures reported at index %d", i) + break + } + } + }) + } +} + +func TestDBWALRotationCrash(t *testing.T) { + memfs := vfs.NewStrictMem() + + var index atomic.Int32 + inj := errorfs.InjectorFunc(func(op errorfs.Op) error { + if op.Kind.ReadOrWrite() == errorfs.OpIsWrite && index.Add(-1) == -1 { + memfs.SetIgnoreSyncs(true) + } + return nil + }) + triggered := func() bool { return index.Load() < 0 } + + run := func(fs *errorfs.FS, k int32) (err error) { + opts := &Options{ + FS: fs, + Logger: panicLogger{}, + MemTableSize: 2048, + } + opts.private.disableTableStats = true + d, err := Open("", opts) + if err != nil || triggered() { + return err + } + + // Write keys with the FS set up to simulate a 
crash by ignoring + // syncs on the k-th write operation. + index.Store(k) + key := []byte("test") + for i := 0; i < 10; i++ { + v := []byte(strings.Repeat("b", i)) + err = d.Set(key, v, nil) + if err != nil || triggered() { + break + } + } + err = firstError(err, d.Close()) + return err + } + + fs := errorfs.Wrap(memfs, inj) + for k := int32(0); ; k++ { + // Run, simulating a crash by ignoring syncs after the k-th write + // operation after Open. + index.Store(math.MaxInt32) + err := run(fs, k) + if !triggered() { + // Stop when we reach a value of k greater than the number of + // write operations performed during `run`. + t.Logf("No crash at write operation %d\n", k) + if err != nil { + t.Fatalf("Filesystem did not 'crash', but error returned: %s", err) + } + break + } + t.Logf("Crashed at write operation % 2d, error: %v\n", k, err) + + // Reset the filesystem to its state right before the simulated + // "crash", restore syncs, and run again without crashing. + memfs.ResetToSyncedState() + memfs.SetIgnoreSyncs(false) + index.Store(math.MaxInt32) + require.NoError(t, run(fs, k)) + } +} diff --git a/pebble/event.go b/pebble/event.go new file mode 100644 index 0000000..ea527ef --- /dev/null +++ b/pebble/event.go @@ -0,0 +1,767 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "fmt" + "strings" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/humanize" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/redact" +) + +// TableInfo exports the manifest.TableInfo type. 
+type TableInfo = manifest.TableInfo + +func tablesTotalSize(tables []TableInfo) uint64 { + var size uint64 + for i := range tables { + size += tables[i].Size + } + return size +} + +func formatFileNums(tables []TableInfo) string { + var buf strings.Builder + for i := range tables { + if i > 0 { + buf.WriteString(" ") + } + buf.WriteString(tables[i].FileNum.String()) + } + return buf.String() +} + +// LevelInfo contains info pertaining to a particular level. +type LevelInfo struct { + Level int + Tables []TableInfo + Score float64 +} + +func (i LevelInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i LevelInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("L%d [%s] (%s) Score=%.2f", + redact.Safe(i.Level), + redact.Safe(formatFileNums(i.Tables)), + redact.Safe(humanize.Bytes.Uint64(tablesTotalSize(i.Tables))), + redact.Safe(i.Score)) +} + +// CompactionInfo contains the info for a compaction event. +type CompactionInfo struct { + // JobID is the ID of the compaction job. + JobID int + // Reason is the reason for the compaction. + Reason string + // Input contains the input tables for the compaction organized by level. + Input []LevelInfo + // Output contains the output tables generated by the compaction. The output + // tables are empty for the compaction begin event. + Output LevelInfo + // Duration is the time spent compacting, including reading and writing + // sstables. + Duration time.Duration + // TotalDuration is the total wall-time duration of the compaction, + // including applying the compaction to the database. TotalDuration is + // always ≥ Duration. 
+ TotalDuration time.Duration + Done bool + Err error + + SingleLevelOverlappingRatio float64 + MultiLevelOverlappingRatio float64 + + // Annotations specifies additional info to appear in a compaction's event log line + Annotations compactionAnnotations +} + +type compactionAnnotations []string + +// SafeFormat implements redact.SafeFormatter. +func (ca compactionAnnotations) SafeFormat(w redact.SafePrinter, _ rune) { + if len(ca) == 0 { + return + } + for i := range ca { + if i != 0 { + w.Print(" ") + } + w.Printf("%s", redact.SafeString(ca[i])) + } +} + +func (i CompactionInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i CompactionInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] compaction(%s) to L%d error: %s", + redact.Safe(i.JobID), redact.SafeString(i.Reason), redact.Safe(i.Output.Level), i.Err) + return + } + + if !i.Done { + w.Printf("[JOB %d] compacting(%s) ", + redact.Safe(i.JobID), + redact.SafeString(i.Reason)) + w.Printf("%s", i.Annotations) + w.Printf("%s; ", levelInfos(i.Input)) + w.Printf("OverlappingRatio: Single %.2f, Multi %.2f", i.SingleLevelOverlappingRatio, i.MultiLevelOverlappingRatio) + return + } + outputSize := tablesTotalSize(i.Output.Tables) + w.Printf("[JOB %d] compacted(%s) ", redact.Safe(i.JobID), redact.SafeString(i.Reason)) + w.Printf("%s", i.Annotations) + w.Print(levelInfos(i.Input)) + w.Printf(" -> L%d [%s] (%s), in %.1fs (%.1fs total), output rate %s/s", + redact.Safe(i.Output.Level), + redact.Safe(formatFileNums(i.Output.Tables)), + redact.Safe(humanize.Bytes.Uint64(outputSize)), + redact.Safe(i.Duration.Seconds()), + redact.Safe(i.TotalDuration.Seconds()), + redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) +} + +type levelInfos []LevelInfo + +func (i levelInfos) SafeFormat(w redact.SafePrinter, _ rune) { + for j, levelInfo := range i { + if j > 0 { + w.Printf(" + ") + } + 
w.Print(levelInfo) + } +} + +// DiskSlowInfo contains the info for a disk slowness event when writing to a +// file. +type DiskSlowInfo = vfs.DiskSlowInfo + +// FlushInfo contains the info for a flush event. +type FlushInfo struct { + // JobID is the ID of the flush job. + JobID int + // Reason is the reason for the flush. + Reason string + // Input contains the count of input memtables that were flushed. + Input int + // InputBytes contains the total in-memory size of the memtable(s) that were + // flushed. This size includes skiplist indexing data structures. + InputBytes uint64 + // Output contains the ouptut table generated by the flush. The output info + // is empty for the flush begin event. + Output []TableInfo + // Duration is the time spent flushing. This duration includes writing and + // syncing all of the flushed keys to sstables. + Duration time.Duration + // TotalDuration is the total wall-time duration of the flush, including + // applying the flush to the database. TotalDuration is always ≥ Duration. + TotalDuration time.Duration + // Ingest is set to true if the flush is handling tables that were added to + // the flushable queue via an ingestion operation. + Ingest bool + // IngestLevels are the output levels for each ingested table in the flush. + // This field is only populated when Ingest is true. + IngestLevels []int + Done bool + Err error +} + +func (i FlushInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. 
+func (i FlushInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] flush error: %s", redact.Safe(i.JobID), i.Err) + return + } + + plural := redact.SafeString("s") + if i.Input == 1 { + plural = "" + } + if !i.Done { + w.Printf("[JOB %d] ", redact.Safe(i.JobID)) + if !i.Ingest { + w.Printf("flushing %d memtable", redact.Safe(i.Input)) + w.SafeString(plural) + w.Printf(" (%s) to L0", redact.Safe(humanize.Bytes.Uint64(i.InputBytes))) + } else { + w.Printf("flushing %d ingested table%s", redact.Safe(i.Input), plural) + } + return + } + + outputSize := tablesTotalSize(i.Output) + if !i.Ingest { + if invariants.Enabled && len(i.IngestLevels) > 0 { + panic(errors.AssertionFailedf("pebble: expected len(IngestedLevels) == 0")) + } + w.Printf("[JOB %d] flushed %d memtable%s (%s) to L0 [%s] (%s), in %.1fs (%.1fs total), output rate %s/s", + redact.Safe(i.JobID), redact.Safe(i.Input), plural, + redact.Safe(humanize.Bytes.Uint64(i.InputBytes)), + redact.Safe(formatFileNums(i.Output)), + redact.Safe(humanize.Bytes.Uint64(outputSize)), + redact.Safe(i.Duration.Seconds()), + redact.Safe(i.TotalDuration.Seconds()), + redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) + } else { + if invariants.Enabled && len(i.IngestLevels) == 0 { + panic(errors.AssertionFailedf("pebble: expected len(IngestedLevels) > 0")) + } + w.Printf("[JOB %d] flushed %d ingested flushable%s", + redact.Safe(i.JobID), redact.Safe(len(i.Output)), plural) + for j, level := range i.IngestLevels { + file := i.Output[j] + if j > 0 { + w.Printf(" +") + } + w.Printf(" L%d:%s (%s)", level, file.FileNum, humanize.Bytes.Uint64(file.Size)) + } + w.Printf(" in %.1fs (%.1fs total), output rate %s/s", + redact.Safe(i.Duration.Seconds()), + redact.Safe(i.TotalDuration.Seconds()), + redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) + } +} + +// ManifestCreateInfo contains info about a manifest creation event. 
+type ManifestCreateInfo struct { + // JobID is the ID of the job the caused the manifest to be created. + JobID int + Path string + // The file number of the new Manifest. + FileNum base.DiskFileNum + Err error +} + +func (i ManifestCreateInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i ManifestCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] MANIFEST create error: %s", redact.Safe(i.JobID), i.Err) + return + } + w.Printf("[JOB %d] MANIFEST created %s", redact.Safe(i.JobID), i.FileNum) +} + +// ManifestDeleteInfo contains the info for a Manifest deletion event. +type ManifestDeleteInfo struct { + // JobID is the ID of the job the caused the Manifest to be deleted. + JobID int + Path string + FileNum FileNum + Err error +} + +func (i ManifestDeleteInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i ManifestDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] MANIFEST delete error: %s", redact.Safe(i.JobID), i.Err) + return + } + w.Printf("[JOB %d] MANIFEST deleted %s", redact.Safe(i.JobID), i.FileNum) +} + +// TableCreateInfo contains the info for a table creation event. +type TableCreateInfo struct { + JobID int + // Reason is the reason for the table creation: "compacting", "flushing", or + // "ingesting". + Reason string + Path string + FileNum FileNum +} + +func (i TableCreateInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("[JOB %d] %s: sstable created %s", + redact.Safe(i.JobID), redact.Safe(i.Reason), i.FileNum) +} + +// TableDeleteInfo contains the info for a table deletion event. 
+type TableDeleteInfo struct { + JobID int + Path string + FileNum FileNum + Err error +} + +func (i TableDeleteInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] sstable delete error %s: %s", + redact.Safe(i.JobID), i.FileNum, i.Err) + return + } + w.Printf("[JOB %d] sstable deleted %s", redact.Safe(i.JobID), i.FileNum) +} + +// TableIngestInfo contains the info for a table ingestion event. +type TableIngestInfo struct { + // JobID is the ID of the job that caused the table to be ingested. + JobID int + Tables []struct { + TableInfo + Level int + } + // GlobalSeqNum is the sequence number that was assigned to all entries in + // the ingested table. + GlobalSeqNum uint64 + // flushable indicates whether the ingested sstable was treated as a + // flushable. + flushable bool + Err error +} + +func (i TableIngestInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableIngestInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] ingest error: %s", redact.Safe(i.JobID), i.Err) + return + } + + if i.flushable { + w.Printf("[JOB %d] ingested as flushable", redact.Safe(i.JobID)) + } else { + w.Printf("[JOB %d] ingested", redact.Safe(i.JobID)) + } + + for j := range i.Tables { + t := &i.Tables[j] + if j > 0 { + w.Printf(",") + } + levelStr := "" + if !i.flushable { + levelStr = fmt.Sprintf("L%d:", t.Level) + } + w.Printf(" %s%s (%s)", redact.Safe(levelStr), t.FileNum, + redact.Safe(humanize.Bytes.Uint64(t.Size))) + } +} + +// TableStatsInfo contains the info for a table stats loaded event. +type TableStatsInfo struct { + // JobID is the ID of the job that finished loading the initial tables' + // stats. 
+ JobID int +} + +func (i TableStatsInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableStatsInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("[JOB %d] all initial table stats loaded", redact.Safe(i.JobID)) +} + +// TableValidatedInfo contains information on the result of a validation run +// on an sstable. +type TableValidatedInfo struct { + JobID int + Meta *fileMetadata +} + +func (i TableValidatedInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableValidatedInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("[JOB %d] validated table: %s", redact.Safe(i.JobID), i.Meta) +} + +// WALCreateInfo contains info about a WAL creation event. +type WALCreateInfo struct { + // JobID is the ID of the job that caused the WAL to be created. + JobID int + Path string + // The file number of the new WAL. + FileNum base.DiskFileNum + // The file number of a previous WAL which was recycled to create this + // one. Zero if recycling did not take place. + RecycledFileNum FileNum + Err error +} + +func (i WALCreateInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i WALCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] WAL create error: %s", redact.Safe(i.JobID), i.Err) + return + } + + if i.RecycledFileNum == 0 { + w.Printf("[JOB %d] WAL created %s", redact.Safe(i.JobID), i.FileNum) + return + } + + w.Printf("[JOB %d] WAL created %s (recycled %s)", + redact.Safe(i.JobID), i.FileNum, i.RecycledFileNum) +} + +// WALDeleteInfo contains the info for a WAL deletion event. +type WALDeleteInfo struct { + // JobID is the ID of the job that caused the WAL to be deleted. 
+ JobID int + Path string + FileNum FileNum + Err error +} + +func (i WALDeleteInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i WALDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] WAL delete error: %s", redact.Safe(i.JobID), i.Err) + return + } + w.Printf("[JOB %d] WAL deleted %s", redact.Safe(i.JobID), i.FileNum) +} + +// WriteStallBeginInfo contains the info for a write stall begin event. +type WriteStallBeginInfo struct { + Reason string +} + +func (i WriteStallBeginInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i WriteStallBeginInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("write stall beginning: %s", redact.Safe(i.Reason)) +} + +// EventListener contains a set of functions that will be invoked when various +// significant DB events occur. Note that the functions should not run for an +// excessive amount of time as they are invoked synchronously by the DB and may +// block continued DB work. For a similar reason it is advisable to not perform +// any synchronous calls back into the DB. +type EventListener struct { + // BackgroundError is invoked whenever an error occurs during a background + // operation such as flush or compaction. + BackgroundError func(error) + + // CompactionBegin is invoked after the inputs to a compaction have been + // determined, but before the compaction has produced any output. + CompactionBegin func(CompactionInfo) + + // CompactionEnd is invoked after a compaction has completed and the result + // has been installed. + CompactionEnd func(CompactionInfo) + + // DiskSlow is invoked after a disk write operation on a file created with a + // disk health checking vfs.FS (see vfs.DefaultWithDiskHealthChecks) is + // observed to exceed the specified disk slowness threshold duration. 
DiskSlow + // is called on a goroutine that is monitoring slowness/stuckness. The callee + // MUST return without doing any IO, or blocking on anything (like a mutex) + // that is waiting on IO. This is imperative in order to reliably monitor for + // slowness, since if this goroutine gets stuck, the monitoring will stop + // working. + DiskSlow func(DiskSlowInfo) + + // FlushBegin is invoked after the inputs to a flush have been determined, + // but before the flush has produced any output. + FlushBegin func(FlushInfo) + + // FlushEnd is invoked after a flush has completed and the result has been + // installed. + FlushEnd func(FlushInfo) + + // FormatUpgrade is invoked after the database's FormatMajorVersion + // is upgraded. + FormatUpgrade func(FormatMajorVersion) + + // ManifestCreated is invoked after a manifest has been created. + ManifestCreated func(ManifestCreateInfo) + + // ManifestDeleted is invoked after a manifest has been deleted. + ManifestDeleted func(ManifestDeleteInfo) + + // TableCreated is invoked when a table has been created. + TableCreated func(TableCreateInfo) + + // TableDeleted is invoked after a table has been deleted. + TableDeleted func(TableDeleteInfo) + + // TableIngested is invoked after an externally created table has been + // ingested via a call to DB.Ingest(). + TableIngested func(TableIngestInfo) + + // TableStatsLoaded is invoked at most once, when the table stats + // collector has loaded statistics for all tables that existed at Open. + TableStatsLoaded func(TableStatsInfo) + + // TableValidated is invoked after validation runs on an sstable. + TableValidated func(TableValidatedInfo) + + // WALCreated is invoked after a WAL has been created. + WALCreated func(WALCreateInfo) + + // WALDeleted is invoked after a WAL has been deleted. + WALDeleted func(WALDeleteInfo) + + // WriteStallBegin is invoked when writes are intentionally delayed. 
+ WriteStallBegin func(WriteStallBeginInfo) + + // WriteStallEnd is invoked when delayed writes are released. + WriteStallEnd func() +} + +// EnsureDefaults ensures that background error events are logged to the +// specified logger if a handler for those events hasn't been otherwise +// specified. Ensure all handlers are non-nil so that we don't have to check +// for nil-ness before invoking. +func (l *EventListener) EnsureDefaults(logger Logger) { + if l.BackgroundError == nil { + if logger != nil { + l.BackgroundError = func(err error) { + logger.Errorf("background error: %s", err) + } + } else { + l.BackgroundError = func(error) {} + } + } + if l.CompactionBegin == nil { + l.CompactionBegin = func(info CompactionInfo) {} + } + if l.CompactionEnd == nil { + l.CompactionEnd = func(info CompactionInfo) {} + } + if l.DiskSlow == nil { + l.DiskSlow = func(info DiskSlowInfo) {} + } + if l.FlushBegin == nil { + l.FlushBegin = func(info FlushInfo) {} + } + if l.FlushEnd == nil { + l.FlushEnd = func(info FlushInfo) {} + } + if l.FormatUpgrade == nil { + l.FormatUpgrade = func(v FormatMajorVersion) {} + } + if l.ManifestCreated == nil { + l.ManifestCreated = func(info ManifestCreateInfo) {} + } + if l.ManifestDeleted == nil { + l.ManifestDeleted = func(info ManifestDeleteInfo) {} + } + if l.TableCreated == nil { + l.TableCreated = func(info TableCreateInfo) {} + } + if l.TableDeleted == nil { + l.TableDeleted = func(info TableDeleteInfo) {} + } + if l.TableIngested == nil { + l.TableIngested = func(info TableIngestInfo) {} + } + if l.TableStatsLoaded == nil { + l.TableStatsLoaded = func(info TableStatsInfo) {} + } + if l.TableValidated == nil { + l.TableValidated = func(validated TableValidatedInfo) {} + } + if l.WALCreated == nil { + l.WALCreated = func(info WALCreateInfo) {} + } + if l.WALDeleted == nil { + l.WALDeleted = func(info WALDeleteInfo) {} + } + if l.WriteStallBegin == nil { + l.WriteStallBegin = func(info WriteStallBeginInfo) {} + } + if l.WriteStallEnd == 
nil { + l.WriteStallEnd = func() {} + } +} + +// MakeLoggingEventListener creates an EventListener that logs all events to the +// specified logger. +func MakeLoggingEventListener(logger Logger) EventListener { + if logger == nil { + logger = DefaultLogger + } + + return EventListener{ + BackgroundError: func(err error) { + logger.Errorf("background error: %s", err) + }, + CompactionBegin: func(info CompactionInfo) { + logger.Infof("%s", info) + }, + CompactionEnd: func(info CompactionInfo) { + logger.Infof("%s", info) + }, + DiskSlow: func(info DiskSlowInfo) { + logger.Infof("%s", info) + }, + FlushBegin: func(info FlushInfo) { + logger.Infof("%s", info) + }, + FlushEnd: func(info FlushInfo) { + logger.Infof("%s", info) + }, + FormatUpgrade: func(v FormatMajorVersion) { + logger.Infof("upgraded to format version: %s", v) + }, + ManifestCreated: func(info ManifestCreateInfo) { + logger.Infof("%s", info) + }, + ManifestDeleted: func(info ManifestDeleteInfo) { + logger.Infof("%s", info) + }, + TableCreated: func(info TableCreateInfo) { + logger.Infof("%s", info) + }, + TableDeleted: func(info TableDeleteInfo) { + logger.Infof("%s", info) + }, + TableIngested: func(info TableIngestInfo) { + logger.Infof("%s", info) + }, + TableStatsLoaded: func(info TableStatsInfo) { + logger.Infof("%s", info) + }, + TableValidated: func(info TableValidatedInfo) { + logger.Infof("%s", info) + }, + WALCreated: func(info WALCreateInfo) { + logger.Infof("%s", info) + }, + WALDeleted: func(info WALDeleteInfo) { + logger.Infof("%s", info) + }, + WriteStallBegin: func(info WriteStallBeginInfo) { + logger.Infof("%s", info) + }, + WriteStallEnd: func() { + logger.Infof("write stall ending") + }, + } +} + +// TeeEventListener wraps two EventListeners, forwarding all events to both. 
+func TeeEventListener(a, b EventListener) EventListener { + a.EnsureDefaults(nil) + b.EnsureDefaults(nil) + return EventListener{ + BackgroundError: func(err error) { + a.BackgroundError(err) + b.BackgroundError(err) + }, + CompactionBegin: func(info CompactionInfo) { + a.CompactionBegin(info) + b.CompactionBegin(info) + }, + CompactionEnd: func(info CompactionInfo) { + a.CompactionEnd(info) + b.CompactionEnd(info) + }, + DiskSlow: func(info DiskSlowInfo) { + a.DiskSlow(info) + b.DiskSlow(info) + }, + FlushBegin: func(info FlushInfo) { + a.FlushBegin(info) + b.FlushBegin(info) + }, + FlushEnd: func(info FlushInfo) { + a.FlushEnd(info) + b.FlushEnd(info) + }, + FormatUpgrade: func(v FormatMajorVersion) { + a.FormatUpgrade(v) + b.FormatUpgrade(v) + }, + ManifestCreated: func(info ManifestCreateInfo) { + a.ManifestCreated(info) + b.ManifestCreated(info) + }, + ManifestDeleted: func(info ManifestDeleteInfo) { + a.ManifestDeleted(info) + b.ManifestDeleted(info) + }, + TableCreated: func(info TableCreateInfo) { + a.TableCreated(info) + b.TableCreated(info) + }, + TableDeleted: func(info TableDeleteInfo) { + a.TableDeleted(info) + b.TableDeleted(info) + }, + TableIngested: func(info TableIngestInfo) { + a.TableIngested(info) + b.TableIngested(info) + }, + TableStatsLoaded: func(info TableStatsInfo) { + a.TableStatsLoaded(info) + b.TableStatsLoaded(info) + }, + TableValidated: func(info TableValidatedInfo) { + a.TableValidated(info) + b.TableValidated(info) + }, + WALCreated: func(info WALCreateInfo) { + a.WALCreated(info) + b.WALCreated(info) + }, + WALDeleted: func(info WALDeleteInfo) { + a.WALDeleted(info) + b.WALDeleted(info) + }, + WriteStallBegin: func(info WriteStallBeginInfo) { + a.WriteStallBegin(info) + b.WriteStallBegin(info) + }, + WriteStallEnd: func() { + a.WriteStallEnd() + b.WriteStallEnd() + }, + } +} diff --git a/pebble/event_listener_test.go b/pebble/event_listener_test.go new file mode 100644 index 0000000..69325d9 --- /dev/null +++ 
b/pebble/event_listener_test.go @@ -0,0 +1,376 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "fmt" + "reflect" + "strings" + "sync" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/redact" + "github.com/stretchr/testify/require" +) + +// Verify event listener actions, as well as expected filesystem operations. +func TestEventListener(t *testing.T) { + var d *DB + var memLog base.InMemLogger + mem := vfs.NewMem() + require.NoError(t, mem.MkdirAll("ext", 0755)) + + datadriven.RunTest(t, "testdata/event_listener", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "open": + memLog.Reset() + lel := MakeLoggingEventListener(&memLog) + flushBegin, flushEnd := lel.FlushBegin, lel.FlushEnd + lel.FlushBegin = func(info FlushInfo) { + // Make deterministic. + info.InputBytes = 100 + flushBegin(info) + } + lel.FlushEnd = func(info FlushInfo) { + // Make deterministic. + info.InputBytes = 100 + flushEnd(info) + } + opts := &Options{ + FS: vfs.WithLogging(mem, memLog.Infof), + FormatMajorVersion: internalFormatNewest, + EventListener: &lel, + MaxManifestFileSize: 1, + L0CompactionThreshold: 10, + WALDir: "wal", + } + // The table stats collector runs asynchronously and its + // timing is less predictable. It increments nextJobID, which + // can make these tests flaky. The TableStatsLoaded event is + // tested separately in TestTableStats. 
+ opts.private.disableTableStats = true + var err error + d, err = Open("db", opts) + if err != nil { + return err.Error() + } + t := time.Now() + d.timeNow = func() time.Time { + t = t.Add(time.Second) + return t + } + d.opts.private.testingAlwaysWaitForCleanup = true + return memLog.String() + + case "close": + memLog.Reset() + if err := d.Close(); err != nil { + return err.Error() + } + return memLog.String() + + case "flush": + memLog.Reset() + if err := d.Set([]byte("a"), nil, nil); err != nil { + return err.Error() + } + if err := d.Flush(); err != nil { + return err.Error() + } + return memLog.String() + + case "compact": + memLog.Reset() + if err := d.Set([]byte("a"), nil, nil); err != nil { + return err.Error() + } + if err := d.Compact([]byte("a"), []byte("b"), false); err != nil { + return err.Error() + } + return memLog.String() + + case "checkpoint": + memLog.Reset() + if err := d.Checkpoint("checkpoint"); err != nil { + return err.Error() + } + return memLog.String() + + case "disable-file-deletions": + memLog.Reset() + d.mu.Lock() + d.disableFileDeletions() + d.mu.Unlock() + return memLog.String() + + case "enable-file-deletions": + memLog.Reset() + func() { + defer func() { + if r := recover(); r != nil { + memLog.Infof("%v", r) + } + }() + d.mu.Lock() + defer d.mu.Unlock() + d.enableFileDeletions() + }() + d.TestOnlyWaitForCleaning() + return memLog.String() + + case "ingest": + memLog.Reset() + f, err := mem.Create("ext/0") + if err != nil { + return err.Error() + } + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + if err := w.Add(base.MakeInternalKey([]byte("a"), 0, InternalKeyKindSet), nil); err != nil { + return err.Error() + } + if err := w.Close(); err != nil { + return err.Error() + } + if err := d.Ingest([]string{"ext/0"}); err != nil { + return err.Error() + } + return memLog.String() + + case "ingest-flushable": + memLog.Reset() + + // 
Prevent flushes during this test to ensure determinism. + d.mu.Lock() + d.mu.compact.flushing = true + d.mu.Unlock() + + b := d.NewBatch() + if err := b.Set([]byte("a"), nil, nil); err != nil { + return err.Error() + } + if err := d.Apply(b, nil); err != nil { + return err.Error() + } + writeTable := func(name string, key byte) error { + f, err := mem.Create(name) + if err != nil { + return err + } + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: d.FormatMajorVersion().MaxTableFormat(), + }) + if err := w.Add(base.MakeInternalKey([]byte{key}, 0, InternalKeyKindSet), nil); err != nil { + return err + } + if err := w.Close(); err != nil { + return err + } + return nil + } + tableA, tableB := "ext/a", "ext/b" + if err := writeTable(tableA, 'a'); err != nil { + return err.Error() + } + if err := writeTable(tableB, 'b'); err != nil { + return err.Error() + } + if err := d.Ingest([]string{tableA, tableB}); err != nil { + return err.Error() + } + + // Re-enable flushes, to allow the subsequent flush to proceed. + d.mu.Lock() + d.mu.compact.flushing = false + d.mu.Unlock() + if err := d.Flush(); err != nil { + return err.Error() + } + return memLog.String() + + case "metrics": + // The asynchronous loading of table stats can change metrics, so + // wait for all the tables' stats to be loaded. 
+ d.mu.Lock() + d.waitTableStats() + d.mu.Unlock() + + return d.Metrics().StringForTests() + + case "sstables": + var buf bytes.Buffer + tableInfos, _ := d.SSTables() + for i, level := range tableInfos { + if len(level) == 0 { + continue + } + fmt.Fprintf(&buf, "%d:\n", i) + for _, m := range level { + fmt.Fprintf(&buf, " %d:[%s-%s]\n", + m.FileNum, m.Smallest.UserKey, m.Largest.UserKey) + } + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestWriteStallEvents(t *testing.T) { + const flushCount = 10 + const writeStallEnd = "write stall ending" + + testCases := []struct { + delayFlush bool + expected string + }{ + {true, "memtable count limit reached"}, + {false, "L0 file count limit exceeded"}, + } + + for _, c := range testCases { + t.Run("", func(t *testing.T) { + stallEnded := make(chan struct{}, 1) + createReleased := make(chan struct{}, flushCount) + var log base.InMemLogger + var delayOnce sync.Once + listener := &EventListener{ + TableCreated: func(info TableCreateInfo) { + if c.delayFlush == (info.Reason == "flushing") { + delayOnce.Do(func() { + <-createReleased + }) + } + }, + WriteStallBegin: func(info WriteStallBeginInfo) { + log.Infof("%s", info.String()) + createReleased <- struct{}{} + }, + WriteStallEnd: func() { + log.Infof("%s", writeStallEnd) + select { + case stallEnded <- struct{}{}: + default: + } + }, + } + d, err := Open("db", &Options{ + EventListener: listener, + FS: vfs.NewMem(), + MemTableSize: initialMemTableSize, + MemTableStopWritesThreshold: 2, + L0CompactionThreshold: 2, + L0StopWritesThreshold: 2, + }) + require.NoError(t, err) + defer d.Close() + + for i := 0; i < flushCount; i++ { + require.NoError(t, d.Set([]byte("a"), nil, NoSync)) + + ch, err := d.AsyncFlush() + require.NoError(t, err) + + // If we're delaying the flush (because we're testing for memtable + // write stalls), we can't wait for the flush to finish as doing so + // would deadlock. 
If we're not delaying the flush (because we're + // testing for L0 write stalls), we wait for the flush to finish so we + // don't create too many memtables which would trigger a memtable write + // stall. + if !c.delayFlush { + <-ch + } + if strings.Contains(log.String(), c.expected) { + break + } + } + <-stallEnded + + events := log.String() + require.Contains(t, events, c.expected) + require.Contains(t, events, writeStallEnd) + if testing.Verbose() { + t.Logf("\n%s", events) + } + }) + } +} + +type redactLogger struct { + logger Logger +} + +// Infof implements the Logger.Infof interface. +func (l redactLogger) Infof(format string, args ...interface{}) { + l.logger.Infof("%s", redact.Sprintf(format, args...).Redact()) +} + +// Errorf implements the Logger.Errorf interface. +func (l redactLogger) Errorf(format string, args ...interface{}) { + l.logger.Errorf("%s", redact.Sprintf(format, args...).Redact()) +} + +// Fatalf implements the Logger.Fatalf interface. +func (l redactLogger) Fatalf(format string, args ...interface{}) { + l.logger.Fatalf("%s", redact.Sprintf(format, args...).Redact()) +} + +func TestEventListenerRedact(t *testing.T) { + // The vast majority of event listener fields logged are safe and do not + // need to be redacted. Verify that the rare, unsafe error does appear in + // the log redacted. 
+ var log base.InMemLogger + l := MakeLoggingEventListener(redactLogger{logger: &log}) + l.WALDeleted(WALDeleteInfo{ + JobID: 5, + FileNum: FileNum(20), + Err: errors.Errorf("unredacted error: %s", "unredacted string"), + }) + require.Equal(t, "[JOB 5] WAL delete error: unredacted error: ‹×›\n", log.String()) +} + +func TestEventListenerEnsureDefaultsBackgroundError(t *testing.T) { + e := EventListener{} + e.EnsureDefaults(nil) + e.BackgroundError(errors.New("an example error")) +} + +func TestEventListenerEnsureDefaultsSetsAllCallbacks(t *testing.T) { + e := EventListener{} + e.EnsureDefaults(nil) + testAllCallbacksSetInEventListener(t, e) +} + +func TestMakeLoggingEventListenerSetsAllCallbacks(t *testing.T) { + e := MakeLoggingEventListener(nil) + testAllCallbacksSetInEventListener(t, e) +} + +func TestTeeEventListenerSetsAllCallbacks(t *testing.T) { + e := TeeEventListener(EventListener{}, EventListener{}) + testAllCallbacksSetInEventListener(t, e) +} + +func testAllCallbacksSetInEventListener(t *testing.T, e EventListener) { + t.Helper() + v := reflect.ValueOf(e) + for i := 0; i < v.NumField(); i++ { + fType := v.Type().Field(i) + fVal := v.Field(i) + require.Equal(t, reflect.Func, fType.Type.Kind(), "unexpected non-func field: %s", fType.Name) + require.False(t, fVal.IsNil(), "unexpected nil field: %s", fType.Name) + } +} diff --git a/pebble/example_test.go b/pebble/example_test.go new file mode 100644 index 0000000..5c13df1 --- /dev/null +++ b/pebble/example_test.go @@ -0,0 +1,37 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble_test + +import ( + "fmt" + "log" + + "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/vfs" +) + +func Example() { + db, err := pebble.Open("", &pebble.Options{FS: vfs.NewMem()}) + if err != nil { + log.Fatal(err) + } + key := []byte("hello") + if err := db.Set(key, []byte("world"), pebble.Sync); err != nil { + log.Fatal(err) + } + value, closer, err := db.Get(key) + if err != nil { + log.Fatal(err) + } + fmt.Printf("%s %s\n", key, value) + if err := closer.Close(); err != nil { + log.Fatal(err) + } + if err := db.Close(); err != nil { + log.Fatal(err) + } + // Output: + // hello world +} diff --git a/pebble/external_iterator.go b/pebble/external_iterator.go new file mode 100644 index 0000000..078d016 --- /dev/null +++ b/pebble/external_iterator.go @@ -0,0 +1,561 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "fmt" + "sort" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/sstable" +) + +// ExternalIterOption provide an interface to specify open-time options to +// NewExternalIter. +type ExternalIterOption interface { + // iterApply is called on the iterator during opening in order to set internal + // parameters. + iterApply(*Iterator) + // readerOptions returns any reader options added by this iter option. + readerOptions() []sstable.ReaderOption +} + +type externalIterReaderOptions struct { + opts []sstable.ReaderOption +} + +func (e *externalIterReaderOptions) iterApply(iterator *Iterator) { + // Do nothing. 
+} + +func (e *externalIterReaderOptions) readerOptions() []sstable.ReaderOption { + return e.opts +} + +// ExternalIterReaderOptions returns an ExternalIterOption that specifies +// sstable.ReaderOptions to be applied on sstable readers in NewExternalIter. +func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption { + return &externalIterReaderOptions{opts: opts} +} + +// ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator +// will only be used for forward positioning operations (First, SeekGE, Next). +// This could enable optimizations that take advantage of this invariant. +// Behaviour when a reverse positioning operation is done on an iterator +// opened with this option is unpredictable, though in most cases it should error out. +type ExternalIterForwardOnly struct{} + +func (e ExternalIterForwardOnly) iterApply(iter *Iterator) { + iter.forwardOnly = true +} + +func (e ExternalIterForwardOnly) readerOptions() []sstable.ReaderOption { + return nil +} + +// NewExternalIter takes an input 2d array of sstable files which may overlap +// across subarrays but not within a subarray (at least as far as points are +// concerned; range keys are allowed to overlap arbitrarily even within a +// subarray), and returns an Iterator over the merged contents of the sstables. +// Input sstables may contain point keys, range keys, range deletions, etc. The +// input files slice must be sorted in reverse chronological ordering. A key in a +// file at a lower index subarray will shadow a key with an identical user key +// contained within a file at a higher index subarray. Each subarray must be +// sorted in internal key order, where lower index files contain keys that sort +// left of files with higher indexes. +// +// Input sstables must only contain keys with the zero sequence number. +// +// Iterators constructed through NewExternalIter do not support all iterator +// options, including block-property and table filters. 
NewExternalIter errors +// if an incompatible option is set. +func NewExternalIter( + o *Options, + iterOpts *IterOptions, + files [][]sstable.ReadableFile, + extraOpts ...ExternalIterOption, +) (it *Iterator, err error) { + return NewExternalIterWithContext(context.Background(), o, iterOpts, files, extraOpts...) +} + +// NewExternalIterWithContext is like NewExternalIter, and additionally +// accepts a context for tracing. +func NewExternalIterWithContext( + ctx context.Context, + o *Options, + iterOpts *IterOptions, + files [][]sstable.ReadableFile, + extraOpts ...ExternalIterOption, +) (it *Iterator, err error) { + if iterOpts != nil { + if err := validateExternalIterOpts(iterOpts); err != nil { + return nil, err + } + } + + var readers [][]*sstable.Reader + + // Ensure we close all the opened readers if we error out. + defer func() { + if err != nil { + for i := range readers { + for j := range readers[i] { + _ = readers[i][j].Close() + } + } + } + }() + seqNumOffset := 0 + var extraReaderOpts []sstable.ReaderOption + for i := range extraOpts { + extraReaderOpts = append(extraReaderOpts, extraOpts[i].readerOptions()...) + } + for _, levelFiles := range files { + seqNumOffset += len(levelFiles) + } + for _, levelFiles := range files { + var subReaders []*sstable.Reader + seqNumOffset -= len(levelFiles) + subReaders, err = openExternalTables(o, levelFiles, seqNumOffset, o.MakeReaderOptions(), extraReaderOpts...) + readers = append(readers, subReaders) + } + if err != nil { + return nil, err + } + + buf := iterAllocPool.Get().(*iterAlloc) + dbi := &buf.dbi + *dbi = Iterator{ + ctx: ctx, + alloc: buf, + merge: o.Merger.Merge, + comparer: *o.Comparer, + readState: nil, + keyBuf: buf.keyBuf, + prefixOrFullSeekKey: buf.prefixOrFullSeekKey, + boundsBuf: buf.boundsBuf, + batch: nil, + // Add the readers to the Iterator so that Close closes them, and + // SetOptions can re-construct iterators from them. 
+ externalReaders: readers, + newIters: func( + ctx context.Context, f *manifest.FileMetadata, opts *IterOptions, + internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) { + // NB: External iterators are currently constructed without any + // `levelIters`. newIters should never be called. When we support + // organizing multiple non-overlapping files into a single level + // (see TODO below), we'll need to adjust this tableNewIters + // implementation to open iterators by looking up f in a map + // of readers indexed by *fileMetadata. + panic("unreachable") + }, + seqNum: base.InternalKeySeqNumMax, + } + if iterOpts != nil { + dbi.opts = *iterOpts + dbi.processBounds(iterOpts.LowerBound, iterOpts.UpperBound) + } + for i := range extraOpts { + extraOpts[i].iterApply(dbi) + } + if err := finishInitializingExternal(ctx, dbi); err != nil { + dbi.Close() + return nil, err + } + return dbi, nil +} + +func validateExternalIterOpts(iterOpts *IterOptions) error { + switch { + case iterOpts.TableFilter != nil: + return errors.Errorf("pebble: external iterator: TableFilter unsupported") + case iterOpts.PointKeyFilters != nil: + return errors.Errorf("pebble: external iterator: PointKeyFilters unsupported") + case iterOpts.RangeKeyFilters != nil: + return errors.Errorf("pebble: external iterator: RangeKeyFilters unsupported") + case iterOpts.OnlyReadGuaranteedDurable: + return errors.Errorf("pebble: external iterator: OnlyReadGuaranteedDurable unsupported") + case iterOpts.UseL6Filters: + return errors.Errorf("pebble: external iterator: UseL6Filters unsupported") + } + return nil +} + +func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterator, error) { + // TODO(jackson): In some instances we could generate fewer levels by using + // L0Sublevels code to organize nonoverlapping files into the same level. + // This would allow us to use levelIters and keep a smaller set of data and + // files in-memory. 
However, it would also require us to identify the bounds + // of all the files upfront. + + if !it.opts.pointKeys() { + return emptyIter, nil + } else if it.pointIter != nil { + return it.pointIter, nil + } + mlevels := it.alloc.mlevels[:0] + + if len(it.externalReaders) > cap(mlevels) { + mlevels = make([]mergingIterLevel, 0, len(it.externalReaders)) + } + for _, readers := range it.externalReaders { + var combinedIters []internalIterator + for _, r := range readers { + var ( + rangeDelIter keyspan.FragmentIterator + pointIter internalIterator + err error + ) + // We could set hideObsoletePoints=true, since we are reading at + // InternalKeySeqNumMax, but we don't bother since these sstables should + // not have obsolete points (so the performance optimization is + // unnecessary), and we don't want to bother constructing a + // BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter. + pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc( + ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */ + false /* hideObsoletePoints */, false, /* useFilterBlock */ + &it.stats.InternalStats, it.opts.CategoryAndQoS, nil, + sstable.TrivialReaderProvider{Reader: r}) + if err != nil { + return nil, err + } + rangeDelIter, err = r.NewRawRangeDelIter() + if err != nil { + return nil, err + } + if rangeDelIter == nil && pointIter != nil && it.forwardOnly { + // TODO(bilal): Consider implementing range key pausing in + // simpleLevelIter so we can reduce mergingIterLevels even more by + // sending all sstable iterators to combinedIters, not just those + // corresponding to sstables without range deletes. 
+ combinedIters = append(combinedIters, pointIter) + continue + } + mlevels = append(mlevels, mergingIterLevel{ + iter: pointIter, + rangeDelIter: rangeDelIter, + }) + } + if len(combinedIters) == 1 { + mlevels = append(mlevels, mergingIterLevel{ + iter: combinedIters[0], + }) + } else if len(combinedIters) > 1 { + sli := &simpleLevelIter{ + cmp: it.cmp, + iters: combinedIters, + } + sli.init(it.opts) + mlevels = append(mlevels, mergingIterLevel{ + iter: sli, + rangeDelIter: nil, + }) + } + } + if len(mlevels) == 1 && mlevels[0].rangeDelIter == nil { + // Set closePointIterOnce to true. This is because we're bypassing the + // merging iter, which turns Close()s on it idempotent for any child + // iterators. The outer Iterator could call Close() on a point iter twice, + // which sstable iterators do not support (as they release themselves to + // a pool). + it.closePointIterOnce = true + return mlevels[0].iter, nil + } + + it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...) + it.alloc.merging.snapshot = base.InternalKeySeqNumMax + if len(mlevels) <= cap(it.alloc.levelsPositioned) { + it.alloc.merging.levelsPositioned = it.alloc.levelsPositioned[:len(mlevels)] + } + return &it.alloc.merging, nil +} + +func finishInitializingExternal(ctx context.Context, it *Iterator) error { + pointIter, err := createExternalPointIter(ctx, it) + if err != nil { + return err + } + it.pointIter = pointIter + it.iter = it.pointIter + + if it.opts.rangeKeys() { + it.rangeKeyMasking.init(it, it.comparer.Compare, it.comparer.Split) + var rangeKeyIters []keyspan.FragmentIterator + if it.rangeKey == nil { + // We could take advantage of the lack of overlaps in range keys within + // each slice in it.externalReaders, and generate keyspan.LevelIters + // out of those. However, since range keys are expected to be sparse to + // begin with, the performance gain might not be significant enough to + // warrant it. 
+ // + // TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not + // operate on FileMetadatas (similar to simpleLevelIter), and implements + // this optimization. + for _, readers := range it.externalReaders { + for _, r := range readers { + if rki, err := r.NewRawRangeKeyIter(); err != nil { + return err + } else if rki != nil { + rangeKeyIters = append(rangeKeyIters, rki) + } + } + } + if len(rangeKeyIters) > 0 { + it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) + it.rangeKey.init(it.comparer.Compare, it.comparer.Split, &it.opts) + it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init( + &it.comparer, + base.InternalKeySeqNumMax, + it.opts.LowerBound, it.opts.UpperBound, + &it.hasPrefix, &it.prefixOrFullSeekKey, + false /* internalKeys */, &it.rangeKey.internal, + ) + for i := range rangeKeyIters { + it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i]) + } + } + } + if it.rangeKey != nil { + it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter, + keyspan.InterleavingIterOpts{ + Mask: &it.rangeKeyMasking, + LowerBound: it.opts.LowerBound, + UpperBound: it.opts.UpperBound, + }) + it.iter = &it.rangeKey.iiter + } + } + return nil +} + +func openExternalTables( + o *Options, + files []sstable.ReadableFile, + seqNumOffset int, + readerOpts sstable.ReaderOptions, + extraReaderOpts ...sstable.ReaderOption, +) (readers []*sstable.Reader, err error) { + readers = make([]*sstable.Reader, 0, len(files)) + for i := range files { + readable, err := sstable.NewSimpleReadable(files[i]) + if err != nil { + return readers, err + } + r, err := sstable.NewReader(readable, readerOpts, extraReaderOpts...) + if err != nil { + return readers, err + } + // Use the index of the file in files as the sequence number for all of + // its keys. 
+ r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset) + readers = append(readers, r) + } + return readers, err +} + +// simpleLevelIter is similar to a levelIter in that it merges the points +// from multiple point iterators that are non-overlapping in the key ranges +// they return. It is only expected to support forward iteration and forward +// regular seeking; reverse iteration and prefix seeking is not supported. +// Intended to be a low-overhead, non-FileMetadata dependent option for +// NewExternalIter. To optimize seeking and forward iteration, it maintains +// two slices of child iterators; one of all iterators, and a subset of it that +// contains just the iterators that contain point keys within the current +// bounds. +// +// Note that this levelIter does not support pausing at file boundaries +// in case of range tombstones in this file that could apply to points outside +// of this file (and outside of this level). This is sufficient for optimizing +// the main use cases of NewExternalIter, however for completeness it would make +// sense to build this pausing functionality in. +type simpleLevelIter struct { + cmp Compare + err error + lowerBound []byte + iters []internalIterator + filtered []internalIterator + firstKeys [][]byte + firstKeysBuf []byte + currentIdx int +} + +var _ internalIterator = &simpleLevelIter{} + +// init initializes this simpleLevelIter. 
+func (s *simpleLevelIter) init(opts IterOptions) { + s.currentIdx = 0 + s.lowerBound = opts.LowerBound + s.resetFilteredIters() +} + +func (s *simpleLevelIter) resetFilteredIters() { + s.filtered = s.filtered[:0] + s.firstKeys = s.firstKeys[:0] + s.firstKeysBuf = s.firstKeysBuf[:0] + s.err = nil + for i := range s.iters { + var iterKey *base.InternalKey + if s.lowerBound != nil { + iterKey, _ = s.iters[i].SeekGE(s.lowerBound, base.SeekGEFlagsNone) + } else { + iterKey, _ = s.iters[i].First() + } + if iterKey != nil { + s.filtered = append(s.filtered, s.iters[i]) + bufStart := len(s.firstKeysBuf) + s.firstKeysBuf = append(s.firstKeysBuf, iterKey.UserKey...) + s.firstKeys = append(s.firstKeys, s.firstKeysBuf[bufStart:bufStart+len(iterKey.UserKey)]) + } else if err := s.iters[i].Error(); err != nil { + s.err = err + } + } +} + +func (s *simpleLevelIter) SeekGE( + key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + if s.err != nil { + return nil, base.LazyValue{} + } + // Find the first file that is entirely >= key. The file before that could + // contain the key we're looking for. 
+ n := sort.Search(len(s.firstKeys), func(i int) bool { + return s.cmp(key, s.firstKeys[i]) <= 0 + }) + if n > 0 { + s.currentIdx = n - 1 + } else { + s.currentIdx = n + } + if s.currentIdx < len(s.filtered) { + if iterKey, val := s.filtered[s.currentIdx].SeekGE(key, flags); iterKey != nil { + return iterKey, val + } + if err := s.filtered[s.currentIdx].Error(); err != nil { + s.err = err + } + s.currentIdx++ + } + return s.skipEmptyFileForward(key, flags) +} + +func (s *simpleLevelIter) skipEmptyFileForward( + seekKey []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + var iterKey *base.InternalKey + var val base.LazyValue + for s.currentIdx >= 0 && s.currentIdx < len(s.filtered) && s.err == nil { + if seekKey != nil { + iterKey, val = s.filtered[s.currentIdx].SeekGE(seekKey, flags) + } else if s.lowerBound != nil { + iterKey, val = s.filtered[s.currentIdx].SeekGE(s.lowerBound, flags) + } else { + iterKey, val = s.filtered[s.currentIdx].First() + } + if iterKey != nil { + return iterKey, val + } + if err := s.filtered[s.currentIdx].Error(); err != nil { + s.err = err + } + s.currentIdx++ + } + return nil, base.LazyValue{} +} + +func (s *simpleLevelIter) SeekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +func (s *simpleLevelIter) SeekLT( + key []byte, flags base.SeekLTFlags, +) (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +func (s *simpleLevelIter) First() (*base.InternalKey, base.LazyValue) { + if s.err != nil { + return nil, base.LazyValue{} + } + s.currentIdx = 0 + return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) +} + +func (s *simpleLevelIter) Last() (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +func (s *simpleLevelIter) Next() (*base.InternalKey, base.LazyValue) { + if s.err != nil { + return nil, base.LazyValue{} + } + if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { + return nil, 
base.LazyValue{} + } + if iterKey, val := s.filtered[s.currentIdx].Next(); iterKey != nil { + return iterKey, val + } + s.currentIdx++ + return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) +} + +func (s *simpleLevelIter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { + if s.err != nil { + return nil, base.LazyValue{} + } + if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { + return nil, base.LazyValue{} + } + if iterKey, val := s.filtered[s.currentIdx].NextPrefix(succKey); iterKey != nil { + return iterKey, val + } + s.currentIdx++ + return s.skipEmptyFileForward(succKey /* seekKey */, base.SeekGEFlagsNone) +} + +func (s *simpleLevelIter) Prev() (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +func (s *simpleLevelIter) Error() error { + if s.currentIdx >= 0 && s.currentIdx < len(s.filtered) { + s.err = firstError(s.err, s.filtered[s.currentIdx].Error()) + } + return s.err +} + +func (s *simpleLevelIter) Close() error { + var err error + for i := range s.iters { + err = firstError(err, s.iters[i].Close()) + } + return err +} + +func (s *simpleLevelIter) SetBounds(lower, upper []byte) { + s.currentIdx = -1 + s.lowerBound = lower + for i := range s.iters { + s.iters[i].SetBounds(lower, upper) + } + s.resetFilteredIters() +} + +func (s *simpleLevelIter) SetContext(_ context.Context) {} + +func (s *simpleLevelIter) String() string { + if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { + return "simpleLevelIter: current=" + } + return fmt.Sprintf("simpleLevelIter: current=%s", s.filtered[s.currentIdx]) +} + +var _ internalIterator = &simpleLevelIter{} diff --git a/pebble/external_iterator_test.go b/pebble/external_iterator_test.go new file mode 100644 index 0000000..77afd4d --- /dev/null +++ b/pebble/external_iterator_test.go @@ -0,0 +1,380 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "fmt" + "math" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/cache" + "github.com/cockroachdb/pebble/internal/itertest" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" + "golang.org/x/exp/rand" +) + +func TestExternalIterator(t *testing.T) { + mem := vfs.NewMem() + o := &Options{ + FS: mem, + Comparer: testkeys.Comparer, + FormatMajorVersion: FormatRangeKeys, + } + o.EnsureDefaults() + d, err := Open("", o) + require.NoError(t, err) + defer func() { require.NoError(t, d.Close()) }() + + datadriven.RunTest(t, "testdata/external_iterator", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + mem = vfs.NewMem() + return "" + case "build": + if err := runBuildCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + case "iter": + opts := IterOptions{KeyTypes: IterKeyTypePointsAndRanges} + var externalIterOpts []ExternalIterOption + var files [][]sstable.ReadableFile + for _, arg := range td.CmdArgs { + switch arg.Key { + case "fwd-only": + externalIterOpts = append(externalIterOpts, ExternalIterForwardOnly{}) + case "mask-suffix": + opts.RangeKeyMasking.Suffix = []byte(arg.Vals[0]) + case "lower": + opts.LowerBound = []byte(arg.Vals[0]) + case "upper": + opts.UpperBound = []byte(arg.Vals[0]) + case "files": + for _, v := range arg.Vals { + f, err := mem.Open(v) + require.NoError(t, err) + files = append(files, []sstable.ReadableFile{f}) + } + } + } + it, err := NewExternalIter(o, &opts, files, externalIterOpts...) 
+ require.NoError(t, err) + return runIterCmd(td, it, true /* close iter */) + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestSimpleLevelIter(t *testing.T) { + mem := vfs.NewMem() + o := &Options{ + FS: mem, + Comparer: testkeys.Comparer, + FormatMajorVersion: FormatRangeKeys, + } + o.EnsureDefaults() + d, err := Open("", o) + require.NoError(t, err) + defer func() { require.NoError(t, d.Close()) }() + + datadriven.RunTest(t, "testdata/simple_level_iter", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + mem = vfs.NewMem() + return "" + case "build": + if err := runBuildCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + case "iter": + var files []sstable.ReadableFile + var filenames []string + td.ScanArgs(t, "files", &filenames) + for _, name := range filenames { + f, err := mem.Open(name) + require.NoError(t, err) + files = append(files, f) + } + readers, err := openExternalTables(o, files, 0, o.MakeReaderOptions()) + require.NoError(t, err) + defer func() { + for i := range readers { + _ = readers[i].Close() + } + }() + var internalIters []internalIterator + for i := range readers { + iter, err := readers[i].NewIter(nil, nil) + require.NoError(t, err) + internalIters = append(internalIters, iter) + } + it := &simpleLevelIter{cmp: o.Comparer.Compare, iters: internalIters} + it.init(IterOptions{}) + + response := itertest.RunInternalIterCmd(t, td, it) + require.NoError(t, it.Close()) + return response + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestSimpleIterError(t *testing.T) { + s := simpleLevelIter{cmp: DefaultComparer.Compare, iters: []internalIterator{&errorIter{err: errors.New("injected")}}} + s.init(IterOptions{}) + defer s.Close() + + iterKey, _ := s.First() + require.Nil(t, iterKey) + require.Error(t, s.Error()) +} + +func TestIterRandomizedMaybeFilteredKeys(t *testing.T) { + mem := vfs.NewMem() + + seed := *seed + if seed 
== 0 { + seed = uint64(time.Now().UnixNano()) + t.Logf("seed: %d", seed) + } + rng := rand.New(rand.NewSource(seed)) + numKeys := 100 + rng.Intn(5000) + // The block property filter will exclude keys with suffixes [0, tsSeparator-1]. + // We use the first "part" of the keyspace below to write keys >= tsSeparator, + // and the second part to write keys < tsSeparator. Successive parts (if any) + // will contain keys at random before or after the separator. + tsSeparator := 10 + rng.Int63n(5000) + const keyLen = 5 + + // We split the keyspace into logical "parts" which are disjoint slices of the + // keyspace. That is, the keyspace a-z could be comprised of parts {a-k, l-z}. + // We rely on this partitioning when generating timestamps to give us some + // predictable clustering of timestamps in sstable blocks, however it is not + // strictly necessary for this test. + alpha := testkeys.Alpha(keyLen) + numParts := rng.Intn(3) + 2 + blockSize := 16 + rng.Intn(64) + + c := cache.New(128 << 20) + defer c.Unref() + + for fileIdx, twoLevelIndex := range []bool{false, true} { + t.Run(fmt.Sprintf("twoLevelIndex=%v", twoLevelIndex), func(t *testing.T) { + keys := make([][]byte, 0, numKeys) + + filename := fmt.Sprintf("test-%d", fileIdx) + f0, err := mem.Create(filename) + require.NoError(t, err) + + indexBlockSize := 4096 + if twoLevelIndex { + indexBlockSize = 1 + } + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f0), sstable.WriterOptions{ + BlockSize: blockSize, + Comparer: testkeys.Comparer, + IndexBlockSize: indexBlockSize, + TableFormat: sstable.TableFormatPebblev2, + BlockPropertyCollectors: []func() BlockPropertyCollector{ + func() BlockPropertyCollector { + return sstable.NewTestKeysBlockPropertyCollector() + }, + }, + }) + buf := make([]byte, alpha.MaxLen()+testkeys.MaxSuffixLen) + valBuf := make([]byte, 20) + keyIdx := int64(0) + for i := 0; i < numParts; i++ { + // The first two parts of the keyspace are special. 
The first one has + // all keys with timestamps greater than tsSeparator, while the second + // one has all keys with timestamps less than tsSeparator. Any additional + // keys could have timestamps at random before or after the tsSeparator. + maxKeysPerPart := numKeys / numParts + for j := 0; j < maxKeysPerPart; j++ { + var ts int64 + if i == 0 { + ts = rng.Int63n(5000) + tsSeparator + } else if i == 1 { + ts = rng.Int63n(tsSeparator) + } else { + ts = rng.Int63n(tsSeparator + 5000) + } + n := testkeys.WriteKeyAt(buf, alpha, keyIdx*alpha.Count()/int64(numKeys), ts) + keys = append(keys, append([]byte(nil), buf[:n]...)) + randStr(valBuf, rng) + require.NoError(t, w.Set(buf[:n], valBuf)) + keyIdx++ + } + } + require.NoError(t, w.Close()) + + // Re-open that filename for reading. + f1, err := mem.Open(filename) + require.NoError(t, err) + + readable, err := sstable.NewSimpleReadable(f1) + require.NoError(t, err) + + r, err := sstable.NewReader(readable, sstable.ReaderOptions{ + Cache: c, + Comparer: testkeys.Comparer, + }) + require.NoError(t, err) + defer r.Close() + + filter := sstable.NewTestKeysBlockPropertyFilter(uint64(tsSeparator), math.MaxUint64) + filterer, err := sstable.IntersectsTable([]BlockPropertyFilter{filter}, nil, r.Properties.UserProperties) + require.NoError(t, err) + require.NotNil(t, filterer) + + var iter sstable.Iterator + iter, err = r.NewIterWithBlockPropertyFilters( + nil, nil, filterer, false /* useFilterBlock */, nil, /* stats */ + sstable.CategoryAndQoS{}, nil, sstable.TrivialReaderProvider{Reader: r}) + require.NoError(t, err) + defer iter.Close() + var lastSeekKey, lowerBound, upperBound []byte + narrowBoundsMode := false + + for i := 0; i < 10000; i++ { + if rng.Intn(8) == 0 { + // Toggle narrow bounds mode. + if narrowBoundsMode { + // Reset bounds. 
+ lowerBound, upperBound = nil, nil + iter.SetBounds(nil /* lower */, nil /* upper */) + } + narrowBoundsMode = !narrowBoundsMode + } + keyIdx := rng.Intn(len(keys)) + seekKey := keys[keyIdx] + if narrowBoundsMode { + // Case 1: We just entered narrow bounds mode, and both bounds + // are nil. Set a lower/upper bound. + // + // Case 2: The seek key is outside our last bounds. + // + // In either case, pick a narrow range of keys to set bounds on, + // let's say keys[keyIdx-5] and keys[keyIdx+5], before doing our + // seek operation. Picking narrow bounds increases the chance of + // monotonic bound changes. + cmp := testkeys.Comparer.Compare + case1 := lowerBound == nil && upperBound == nil + case2 := (lowerBound != nil && cmp(lowerBound, seekKey) > 0) || (upperBound != nil && cmp(upperBound, seekKey) <= 0) + if case1 || case2 { + lowerBound = nil + if keyIdx-5 >= 0 { + lowerBound = keys[keyIdx-5] + } + upperBound = nil + if keyIdx+5 < len(keys) { + upperBound = keys[keyIdx+5] + } + iter.SetBounds(lowerBound, upperBound) + } + // Case 3: The current seek key is within the previously-set bounds. + // No need to change bounds. + } + flags := base.SeekGEFlagsNone + if lastSeekKey != nil && bytes.Compare(seekKey, lastSeekKey) > 0 { + flags = flags.EnableTrySeekUsingNext() + } + lastSeekKey = append(lastSeekKey[:0], seekKey...) + + newKey, _ := iter.SeekGE(seekKey, flags) + if newKey == nil || !bytes.Equal(newKey.UserKey, seekKey) { + // We skipped some keys. Check if maybeFilteredKeys is true. 
+ formattedNewKey := "" + if newKey != nil { + formattedNewKey = fmt.Sprintf("%s", testkeys.Comparer.FormatKey(newKey.UserKey)) + } + require.True(t, iter.MaybeFilteredKeys(), "seeked for key = %s, got key = %s indicating block property filtering but MaybeFilteredKeys = false", testkeys.Comparer.FormatKey(seekKey), formattedNewKey) + } + } + }) + } +} + +func BenchmarkExternalIter_NonOverlapping_SeekNextScan(b *testing.B) { + ks := testkeys.Alpha(6) + opts := (&Options{}).EnsureDefaults() + iterOpts := &IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + } + writeOpts := opts.MakeWriterOptions(6, sstable.TableFormatPebblev2) + + for _, keyCount := range []int{100, 10_000, 100_000} { + b.Run(fmt.Sprintf("keys=%d", keyCount), func(b *testing.B) { + for _, fileCount := range []int{1, 10, 100} { + b.Run(fmt.Sprintf("files=%d", fileCount), func(b *testing.B) { + var fs vfs.FS = vfs.NewMem() + filenames := make([]string, fileCount) + var keys [][]byte + for i := 0; i < fileCount; i++ { + filename := fmt.Sprintf("%03d.sst", i) + wf, err := fs.Create(filename) + require.NoError(b, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(wf), writeOpts) + for j := 0; j < keyCount/fileCount; j++ { + key := testkeys.Key(ks, int64(len(keys))) + keys = append(keys, key) + require.NoError(b, w.Set(key, key)) + } + require.NoError(b, w.Close()) + filenames[i] = filename + } + + for _, forwardOnly := range []bool{false, true} { + b.Run(fmt.Sprintf("forward-only=%t", forwardOnly), func(b *testing.B) { + var externalIterOpts []ExternalIterOption + if forwardOnly { + externalIterOpts = append(externalIterOpts, ExternalIterForwardOnly{}) + } + + for i := 0; i < b.N; i++ { + func() { + files := make([][]sstable.ReadableFile, fileCount) + for i := 0; i < fileCount; i++ { + f, err := fs.Open(filenames[i]) + require.NoError(b, err) + files[i] = []sstable.ReadableFile{f} + } + + it, err := NewExternalIter(opts, iterOpts, files, externalIterOpts...) 
+ require.NoError(b, err) + defer it.Close() + + for k := 0; k+1 < len(keys); k += 2 { + if !it.SeekGE(keys[k]) { + b.Fatalf("key %q not found", keys[k]) + } + if !it.Next() { + b.Fatalf("key %q not found", keys[k+1]) + } + if !bytes.Equal(it.Key(), keys[k+1]) { + b.Fatalf("expected key %q, found %q", keys[k+1], it.Key()) + } + } + }() + } + }) + } + }) + } + }) + } +} diff --git a/pebble/filenames.go b/pebble/filenames.go new file mode 100644 index 0000000..07d74c8 --- /dev/null +++ b/pebble/filenames.go @@ -0,0 +1,54 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "fmt" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/vfs" +) + +type fileType = base.FileType + +// FileNum is an identifier for a file within a database. +type FileNum = base.FileNum + +const ( + fileTypeLog = base.FileTypeLog + fileTypeLock = base.FileTypeLock + fileTypeTable = base.FileTypeTable + fileTypeManifest = base.FileTypeManifest + fileTypeCurrent = base.FileTypeCurrent + fileTypeOptions = base.FileTypeOptions + fileTypeTemp = base.FileTypeTemp + fileTypeOldTemp = base.FileTypeOldTemp +) + +// setCurrentFile sets the CURRENT file to point to the manifest with +// provided file number. +// +// NB: This is a low-level routine and typically not what you want to +// use. Newer versions of Pebble running newer format major versions do +// not use the CURRENT file. See setCurrentFunc in version_set.go. 
+func setCurrentFile(dirname string, fs vfs.FS, fileNum base.DiskFileNum) error { + newFilename := base.MakeFilepath(fs, dirname, fileTypeCurrent, fileNum) + oldFilename := base.MakeFilepath(fs, dirname, fileTypeTemp, fileNum) + fs.Remove(oldFilename) + f, err := fs.Create(oldFilename) + if err != nil { + return err + } + if _, err := fmt.Fprintf(f, "MANIFEST-%s\n", fileNum); err != nil { + return err + } + if err := f.Sync(); err != nil { + return err + } + if err := f.Close(); err != nil { + return err + } + return fs.Rename(oldFilename, newFilename) +} diff --git a/pebble/filenames_test.go b/pebble/filenames_test.go new file mode 100644 index 0000000..287352e --- /dev/null +++ b/pebble/filenames_test.go @@ -0,0 +1,110 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "testing" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +// TestSetCurrentFileCrash tests a crash that occurs during +// a MANIFEST roll, leaving the temporary CURRENT file on +// the filesystem. These temporary files should be cleaned +// up on Open. +func TestSetCurrentFileCrash(t *testing.T) { + mem := vfs.NewMem() + + // Initialize a fresh database to write the initial MANIFEST. + { + d, err := Open("", &Options{FS: mem}) + require.NoError(t, err) + require.NoError(t, d.Close()) + } + + // Open the database again, this time with a FS that + // errors on Rename and a tiny max manifest file size + // to force manifest rolls. 
+ { + wantErr := errors.New("rename error") + _, err := Open("", &Options{ + FS: renameErrorFS{FS: mem, err: wantErr}, + Logger: noFatalLogger{t: t}, + MaxManifestFileSize: 1, + L0CompactionThreshold: 10, + }) + // Open should fail during a manifest roll, + // leaving a temp dir on the filesystem. + if !errors.Is(err, wantErr) { + t.Fatal(err) + } + } + + // A temp file should be left on the filesystem + // from the failed Rename of the CURRENT file. + if temps := allTempFiles(t, mem); len(temps) == 0 { + t.Fatal("no temp files on the filesystem") + } + + // Open the database a third time with a normal + // filesystem again. It should clean up any temp + // files on Open. + { + d, err := Open("", &Options{ + FS: mem, + MaxManifestFileSize: 1, + L0CompactionThreshold: 10, + }) + require.NoError(t, err) + require.NoError(t, d.Close()) + if temps := allTempFiles(t, mem); len(temps) > 0 { + t.Fatalf("temporary files still on disk: %#v\n", temps) + } + } +} + +func allTempFiles(t *testing.T, fs vfs.FS) []string { + var files []string + ls, err := fs.List("") + require.NoError(t, err) + for _, f := range ls { + ft, _, ok := base.ParseFilename(fs, f) + if ok && ft == fileTypeTemp { + files = append(files, f) + } + } + return files +} + +type renameErrorFS struct { + vfs.FS + err error +} + +func (fs renameErrorFS) Rename(oldname string, newname string) error { + return fs.err +} + +// noFatalLogger implements Logger, logging to the contained +// *testing.T. Notably it does not panic on calls to Fatalf +// to enable unit tests of fatal logic. +type noFatalLogger struct { + t *testing.T +} + +func (l noFatalLogger) Infof(format string, args ...interface{}) { + l.t.Logf(format, args...) +} + +func (l noFatalLogger) Errorf(format string, args ...interface{}) { + l.t.Logf(format, args...) +} + +func (l noFatalLogger) Fatalf(format string, args ...interface{}) { + l.t.Logf(format, args...) 
+} diff --git a/pebble/flush_test.go b/pebble/flush_test.go new file mode 100644 index 0000000..0031420 --- /dev/null +++ b/pebble/flush_test.go @@ -0,0 +1,117 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "fmt" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +func TestManualFlush(t *testing.T) { + getOptions := func() *Options { + opts := &Options{ + FS: vfs.NewMem(), + L0CompactionThreshold: 10, + } + opts.DisableAutomaticCompactions = true + return opts + } + d, err := Open("", getOptions()) + require.NoError(t, err) + defer func() { + require.NoError(t, d.Close()) + }() + + datadriven.RunTest(t, "testdata/manual_flush", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "batch": + b := d.NewBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + b.Commit(nil) + return "" + + case "flush": + if err := d.Flush(); err != nil { + return err.Error() + } + + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "async-flush": + d.mu.Lock() + cur := d.mu.versions.currentVersion() + d.mu.Unlock() + + if _, err := d.AsyncFlush(); err != nil { + return err.Error() + } + + err := try(100*time.Microsecond, 20*time.Second, func() error { + d.mu.Lock() + defer d.mu.Unlock() + if cur == d.mu.versions.currentVersion() { + return errors.New("flush has not occurred") + } + return nil + }) + if err != nil { + return err.Error() + } + + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "reset": + if err := d.Close(); err != nil { + return err.Error() + } + d, err = Open("", getOptions()) + if err != nil { + return err.Error() + } + return "" + + 
default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +// TestFlushDelRangeEmptyKey tests flushing a range tombstone that begins with +// an empty key. The empty key is a valid key but can be confused with nil. +func TestFlushDelRangeEmptyKey(t *testing.T) { + d, err := Open("", &Options{FS: vfs.NewMem()}) + require.NoError(t, err) + require.NoError(t, d.DeleteRange([]byte{}, []byte("z"), nil)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Close()) +} + +// TestFlushEmptyKey tests that flushing an empty key does not trigger that key +// order invariant assertions. +func TestFlushEmptyKey(t *testing.T) { + d, err := Open("", &Options{FS: vfs.NewMem()}) + require.NoError(t, err) + require.NoError(t, d.Set(nil, []byte("hello"), nil)) + require.NoError(t, d.Flush()) + val, closer, err := d.Get(nil) + require.NoError(t, err) + require.Equal(t, val, []byte("hello")) + require.NoError(t, closer.Close()) + require.NoError(t, d.Close()) +} diff --git a/pebble/flushable.go b/pebble/flushable.go new file mode 100644 index 0000000..473dc6a --- /dev/null +++ b/pebble/flushable.go @@ -0,0 +1,254 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "fmt" + "sync/atomic" + "time" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" +) + +// flushable defines the interface for immutable memtables. +type flushable interface { + newIter(o *IterOptions) internalIterator + newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator + newRangeDelIter(o *IterOptions) keyspan.FragmentIterator + newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator + containsRangeKeys() bool + // inuseBytes returns the number of inuse bytes by the flushable. 
+ inuseBytes() uint64 + // totalBytes returns the total number of bytes allocated by the flushable. + totalBytes() uint64 + // readyForFlush returns true when the flushable is ready for flushing. See + // memTable.readyForFlush for one implementation which needs to check whether + // there are any outstanding write references. + readyForFlush() bool +} + +// flushableEntry wraps a flushable and adds additional metadata and +// functionality that is common to all flushables. +type flushableEntry struct { + flushable + // Channel which is closed when the flushable has been flushed. + flushed chan struct{} + // flushForced indicates whether a flush was forced on this memtable (either + // manual, or due to ingestion). Protected by DB.mu. + flushForced bool + // delayedFlushForcedAt indicates whether a timer has been set to force a + // flush on this memtable at some point in the future. Protected by DB.mu. + // Holds the timestamp of when the flush will be issued. + delayedFlushForcedAt time.Time + // logNum corresponds to the WAL that contains the records present in the + // receiver. + logNum base.DiskFileNum + // logSize is the size in bytes of the associated WAL. Protected by DB.mu. + logSize uint64 + // The current logSeqNum at the time the memtable was created. This is + // guaranteed to be less than or equal to any seqnum stored in the memtable. + logSeqNum uint64 + // readerRefs tracks the read references on the flushable. The two sources of + // reader references are DB.mu.mem.queue and readState.memtables. The memory + // reserved by the flushable in the cache is released when the reader refs + // drop to zero. If the flushable is referencing sstables, then the file + // refcount is also decreased once the reader refs drops to 0. If the + // flushable is a memTable, when the reader refs drops to zero, the writer + // refs will already be zero because the memtable will have been flushed and + // that only occurs once the writer refs drops to zero. 
+ readerRefs atomic.Int32 + // Closure to invoke to release memory accounting. + releaseMemAccounting func() + // unrefFiles, if not nil, should be invoked to decrease the ref count of + // files which are backing the flushable. + unrefFiles func() []*fileBacking + // deleteFnLocked should be called if the caller is holding DB.mu. + deleteFnLocked func(obsolete []*fileBacking) + // deleteFn should be called if the caller is not holding DB.mu. + deleteFn func(obsolete []*fileBacking) +} + +func (e *flushableEntry) readerRef() { + switch v := e.readerRefs.Add(1); { + case v <= 1: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + } +} + +// db.mu must not be held when this is called. +func (e *flushableEntry) readerUnref(deleteFiles bool) { + e.readerUnrefHelper(deleteFiles, e.deleteFn) +} + +// db.mu must be held when this is called. +func (e *flushableEntry) readerUnrefLocked(deleteFiles bool) { + e.readerUnrefHelper(deleteFiles, e.deleteFnLocked) +} + +func (e *flushableEntry) readerUnrefHelper( + deleteFiles bool, deleteFn func(obsolete []*fileBacking), +) { + switch v := e.readerRefs.Add(-1); { + case v < 0: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + case v == 0: + if e.releaseMemAccounting == nil { + panic("pebble: memtable reservation already released") + } + e.releaseMemAccounting() + e.releaseMemAccounting = nil + if e.unrefFiles != nil { + obsolete := e.unrefFiles() + e.unrefFiles = nil + if deleteFiles { + deleteFn(obsolete) + } + } + } +} + +type flushableList []*flushableEntry + +// ingestedFlushable is the implementation of the flushable interface for the +// ingesting sstables which are added to the flushable list. +type ingestedFlushable struct { + files []physicalMeta + comparer *Comparer + newIters tableNewIters + newRangeKeyIters keyspan.TableNewSpanIter + + // Since the level slice is immutable, we construct and set it once. It + // should be safe to read from slice in future reads. 
+ slice manifest.LevelSlice + // hasRangeKeys is set on ingestedFlushable construction. + hasRangeKeys bool +} + +func newIngestedFlushable( + files []*fileMetadata, + comparer *Comparer, + newIters tableNewIters, + newRangeKeyIters keyspan.TableNewSpanIter, +) *ingestedFlushable { + var physicalFiles []physicalMeta + var hasRangeKeys bool + for _, f := range files { + if f.HasRangeKeys { + hasRangeKeys = true + } + physicalFiles = append(physicalFiles, f.PhysicalMeta()) + } + + ret := &ingestedFlushable{ + files: physicalFiles, + comparer: comparer, + newIters: newIters, + newRangeKeyIters: newRangeKeyIters, + // slice is immutable and can be set once and used many times. + slice: manifest.NewLevelSliceKeySorted(comparer.Compare, files), + hasRangeKeys: hasRangeKeys, + } + + return ret +} + +// TODO(sumeer): ingestedFlushable iters also need to plumb context for +// tracing. + +// newIter is part of the flushable interface. +func (s *ingestedFlushable) newIter(o *IterOptions) internalIterator { + var opts IterOptions + if o != nil { + opts = *o + } + // TODO(bananabrick): The manifest.Level in newLevelIter is only used for + // logging. Update the manifest.Level encoding to account for levels which + // aren't truly levels in the lsm. Right now, the encoding only supports + // L0 sublevels, and the rest of the levels in the lsm. + return newLevelIter( + context.Background(), opts, s.comparer, s.newIters, s.slice.Iter(), manifest.Level(0), + internalIterOpts{}, + ) +} + +// newFlushIter is part of the flushable interface. +func (s *ingestedFlushable) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator { + // newFlushIter is only used for writing memtables to disk as sstables. + // Since ingested sstables are already present on disk, they don't need to + // make use of a flush iter. 
+ panic("pebble: not implemented") +} + +func (s *ingestedFlushable) constructRangeDelIter( + file *manifest.FileMetadata, _ keyspan.SpanIterOptions, +) (keyspan.FragmentIterator, error) { + // Note that the keyspan level iter expects a non-nil iterator to be + // returned even if there is an error. So, we return the emptyKeyspanIter. + iter, rangeDelIter, err := s.newIters(context.Background(), file, nil, internalIterOpts{}) + if err != nil { + return emptyKeyspanIter, err + } + iter.Close() + if rangeDelIter == nil { + return emptyKeyspanIter, nil + } + return rangeDelIter, nil +} + +// newRangeDelIter is part of the flushable interface. +// TODO(bananabrick): Using a level iter instead of a keyspan level iter to +// surface range deletes is more efficient. +// +// TODO(sumeer): *IterOptions are being ignored, so the index block load for +// the point iterator in constructRangeDeIter is not tracked. +func (s *ingestedFlushable) newRangeDelIter(_ *IterOptions) keyspan.FragmentIterator { + return keyspan.NewLevelIter( + keyspan.SpanIterOptions{}, s.comparer.Compare, + s.constructRangeDelIter, s.slice.Iter(), manifest.Level(0), + manifest.KeyTypePoint, + ) +} + +// newRangeKeyIter is part of the flushable interface. +func (s *ingestedFlushable) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator { + if !s.containsRangeKeys() { + return nil + } + + return keyspan.NewLevelIter( + keyspan.SpanIterOptions{}, s.comparer.Compare, s.newRangeKeyIters, + s.slice.Iter(), manifest.Level(0), manifest.KeyTypeRange, + ) +} + +// containsRangeKeys is part of the flushable interface. +func (s *ingestedFlushable) containsRangeKeys() bool { + return s.hasRangeKeys +} + +// inuseBytes is part of the flushable interface. +func (s *ingestedFlushable) inuseBytes() uint64 { + // inuseBytes is only used when memtables are flushed to disk as sstables. + panic("pebble: not implemented") +} + +// totalBytes is part of the flushable interface. 
+func (s *ingestedFlushable) totalBytes() uint64 { + // We don't allocate additional bytes for the ingestedFlushable. + return 0 +} + +// readyForFlush is part of the flushable interface. +func (s *ingestedFlushable) readyForFlush() bool { + // ingestedFlushable should always be ready to flush. However, note that + // memtables before the ingested sstables in the memtable queue must be + // flushed before an ingestedFlushable can be flushed. This is because the + // ingested sstables need an updated view of the Version to + // determine where to place the files in the lsm. + return true +} diff --git a/pebble/flushable_test.go b/pebble/flushable_test.go new file mode 100644 index 0000000..c5d1d9c --- /dev/null +++ b/pebble/flushable_test.go @@ -0,0 +1,168 @@ +package pebble + +import ( + "bytes" + "fmt" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +// Simple sanity tests for the flushable interface implementation for ingested +// sstables. +func TestIngestedSSTFlushableAPI(t *testing.T) { + var mem vfs.FS + var d *DB + defer func() { + require.NoError(t, d.Close()) + }() + var flushable flushable + + reset := func() { + if d != nil { + require.NoError(t, d.Close()) + } + + mem = vfs.NewMem() + require.NoError(t, mem.MkdirAll("ext", 0755)) + opts := &Options{ + FS: mem, + L0CompactionThreshold: 100, + L0StopWritesThreshold: 100, + DebugCheck: DebugCheckLevels, + FormatMajorVersion: internalFormatNewest, + } + // Disable automatic compactions because otherwise we'll race with + // delete-only compactions triggered by ingesting range tombstones. 
+ opts.DisableAutomaticCompactions = true + + var err error + d, err = Open("", opts) + require.NoError(t, err) + flushable = nil + } + reset() + + loadFileMeta := func(paths []string) []*fileMetadata { + d.mu.Lock() + pendingOutputs := make([]base.DiskFileNum, len(paths)) + for i := range paths { + pendingOutputs[i] = d.mu.versions.getNextDiskFileNum() + } + jobID := d.mu.nextJobID + d.mu.nextJobID++ + d.mu.Unlock() + + // We can reuse the ingestLoad function for this test even if we're + // not actually ingesting a file. + lr, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, nil, nil, d.cacheID, pendingOutputs, d.objProvider, jobID) + if err != nil { + panic(err) + } + meta := lr.localMeta + if len(meta) == 0 { + // All of the sstables to be ingested were empty. Nothing to do. + panic("empty sstable") + } + // The table cache requires the *fileMetadata to have a positive + // reference count. Fake a reference before we try to load the file. + for _, f := range meta { + f.Ref() + } + + // Verify the sstables do not overlap. + if err := ingestSortAndVerify(d.cmp, lr, KeyRange{}); err != nil { + panic("unsorted sstables") + } + + // Hard link the sstables into the DB directory. Since the sstables aren't + // referenced by a version, they won't be used. If the hard linking fails + // (e.g. because the files reside on a different filesystem), ingestLink will + // fall back to copying, and if that fails we undo our work and return an + // error. + if err := ingestLink(jobID, d.opts, d.objProvider, lr, nil /* shared */); err != nil { + panic("couldn't hard link sstables") + } + + // Fsync the directory we added the tables to. We need to do this at some + // point before we update the MANIFEST (via logAndApply), otherwise a crash + // can have the tables referenced in the MANIFEST, but not present in the + // directory. 
+ if err := d.dataDir.Sync(); err != nil { + panic("Couldn't sync data directory") + } + + return meta + } + + datadriven.RunTest(t, "testdata/ingested_flushable_api", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + reset() + return "" + case "build": + if err := runBuildCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + case "flushable": + // Creates an ingestedFlushable over the input files. + paths := make([]string, 0, len(td.CmdArgs)) + for _, arg := range td.CmdArgs { + paths = append(paths, arg.String()) + } + + meta := loadFileMeta(paths) + flushable = newIngestedFlushable( + meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter, + ) + return "" + case "iter": + iter := flushable.newIter(nil) + var buf bytes.Buffer + for x, _ := iter.First(); x != nil; x, _ = iter.Next() { + buf.WriteString(x.String()) + buf.WriteString("\n") + } + iter.Close() + return buf.String() + case "rangekeyIter": + iter := flushable.newRangeKeyIter(nil) + var buf bytes.Buffer + if iter != nil { + for span := iter.First(); span != nil; span = iter.Next() { + buf.WriteString(span.String()) + buf.WriteString("\n") + } + iter.Close() + } + return buf.String() + case "rangedelIter": + iter := flushable.newRangeDelIter(nil) + var buf bytes.Buffer + if iter != nil { + for span := iter.First(); span != nil; span = iter.Next() { + buf.WriteString(span.String()) + buf.WriteString("\n") + } + iter.Close() + } + return buf.String() + case "readyForFlush": + if flushable.readyForFlush() { + return "true" + } + return "false" + case "containsRangeKey": + if flushable.containsRangeKeys() { + return "true" + } + return "false" + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} diff --git a/pebble/format_major_version.go b/pebble/format_major_version.go new file mode 100644 index 0000000..89be161 --- /dev/null +++ b/pebble/format_major_version.go @@ -0,0 +1,678 @@ +// Copyright 2021 The LevelDB-Go and Pebble 
Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "fmt" + "strconv" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/vfs/atomicfs" +) + +// FormatMajorVersion is a constant controlling the format of persisted +// data. Backwards incompatible changes to durable formats are gated +// behind new format major versions. +// +// At any point, a database's format major version may be bumped. +// However, once a database's format major version is increased, +// previous versions of Pebble will refuse to open the database. +// +// The zero value format is the FormatDefault constant. The exact +// FormatVersion that the default corresponds to may change with time. +type FormatMajorVersion uint64 + +// SafeValue implements redact.SafeValue. +func (v FormatMajorVersion) SafeValue() {} + +// String implements fmt.Stringer. +func (v FormatMajorVersion) String() string { + // NB: This must not change. It's used as the value for the on-disk + // version marker file. + // + // Specifically, this value must always parse as a base 10 integer + // that fits in a uint64. We format it as zero-padded, 3-digit + // number today, but the padding may change. + return fmt.Sprintf("%03d", v) +} + +const ( + // 21.2 versions. + + // FormatDefault leaves the format version unspecified. The + // FormatDefault constant may be ratcheted upwards over time. + FormatDefault FormatMajorVersion = iota + // FormatMostCompatible maintains the most backwards compatibility, + // maintaining bi-directional compatibility with RocksDB 6.2.1 in + // the particular configuration described in the Pebble README. 
+ FormatMostCompatible + // formatVersionedManifestMarker is the first + // backwards-incompatible change made to Pebble, introducing the + // format-version marker file for handling backwards-incompatible + // changes more broadly, and replacing the `CURRENT` file with a + // marker file. + // + // This format version is intended as an intermediary version state. + // It is deliberately unexported to discourage direct use of this + // format major version. Clients should use FormatVersioned which + // also ensures earlier versions of Pebble fail to open a database + // written in a future format major version. + formatVersionedManifestMarker + // FormatVersioned is a new format major version that replaces the + // old `CURRENT` file with a new 'marker' file scheme. Previous + // Pebble versions will be unable to open the database unless + // they're aware of format versions. + FormatVersioned + // FormatSetWithDelete is a format major version that introduces a new key + // kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be + // unable to open this database. + FormatSetWithDelete + + // 22.1 versions. + + // FormatBlockPropertyCollector is a format major version that introduces + // BlockPropertyCollectors. + FormatBlockPropertyCollector + // FormatSplitUserKeysMarked is a format major version that guarantees that + // all files that share user keys with neighbors are marked for compaction + // in the manifest. Ratcheting to FormatSplitUserKeysMarked will block + // (without holding mutexes) until the scan of the LSM is complete and the + // manifest has been rotated. + FormatSplitUserKeysMarked + + // 22.2 versions. + + // FormatSplitUserKeysMarkedCompacted is a format major version that + // guarantees that all files explicitly marked for compaction in the manifest + // have been compacted. 
Combined with the FormatSplitUserKeysMarked format + // major version, this version guarantees that there are no user keys split + // across multiple files within a level L1+. Ratcheting to this format version + // will block (without holding mutexes) until all necessary compactions for + // files marked for compaction are complete. + FormatSplitUserKeysMarkedCompacted + // FormatRangeKeys is a format major version that introduces range keys. + FormatRangeKeys + // FormatMinTableFormatPebblev1 is a format major version that guarantees that + // tables created by or ingested into the DB at or above this format major + // version will have a table format version of at least Pebblev1 (Block + // Properties). + FormatMinTableFormatPebblev1 + // FormatPrePebblev1Marked is a format major version that guarantees that all + // sstables with a table format version pre-Pebblev1 (i.e. those that are + // guaranteed to not contain block properties) are marked for compaction in + // the manifest. Ratcheting to FormatPrePebblev1Marked will block (without + // holding mutexes) until the scan of the LSM is complete and the manifest has + // been rotated. + FormatPrePebblev1Marked + + // 23.1 versions. + + // formatUnusedPrePebblev1MarkedCompacted is an unused format major version. + // This format major version was originally intended to ship in the 23.1 + // release. It was later decided that this should be deferred until a + // subsequent release. The original ordering is preserved so as not to + // introduce breaking changes in Cockroach. + formatUnusedPrePebblev1MarkedCompacted + + // FormatSSTableValueBlocks is a format major version that adds support for + // storing values in value blocks in the sstable. Value block support is not + // necessarily enabled when writing sstables, when running with this format + // major version. 
+ // + // WARNING: In development, so no production code should upgrade to this + // format, since a DB with this format major version will not actually + // interoperate correctly with another DB with the same format major + // version. This format major version is introduced so that tests can start + // being executed up to this version. Note that these tests succeed despite + // the incomplete support since they do not enable value blocks and use + // TableFormatPebblev2. + FormatSSTableValueBlocks + + // FormatFlushableIngest is a format major version that enables lazy + // addition of ingested sstables into the LSM structure. When an ingest + // overlaps with a memtable, a record of the ingest is written to the WAL + // without waiting for a flush. Subsequent reads treat the ingested files as + // a level above the overlapping memtable. Once the memtable is flushed, the + // ingested files are moved into the lowest possible levels. + // + // This feature is behind a format major version because it required + // breaking changes to the WAL format. + FormatFlushableIngest + + // 23.2 versions. + + // FormatPrePebblev1MarkedCompacted is a format major version that guarantees + // that all sstables explicitly marked for compaction in the manifest (see + // FormatPrePebblev1Marked) have been compacted. Ratcheting to this format + // version will block (without holding mutexes) until all necessary + // compactions for files marked for compaction are complete. + FormatPrePebblev1MarkedCompacted + + // FormatDeleteSizedAndObsolete is a format major version that adds support + // for deletion tombstones that encode the size of the value they're + // expected to delete. This format major version is required before the + // associated key kind may be committed through batch applications or + // ingests. It also adds support for keys that are marked obsolete (see + // sstable/format.go for details). 
+ FormatDeleteSizedAndObsolete + + // FormatVirtualSSTables is a format major version that adds support for + // virtual sstables that can reference a sub-range of keys in an underlying + // physical sstable. This information is persisted through new, + // backward-incompatible fields in the Manifest, and therefore requires + // a format major version. + FormatVirtualSSTables + + // internalFormatNewest holds the newest format major version, including + // experimental ones excluded from the exported FormatNewest constant until + // they've stabilized. Used in tests. + internalFormatNewest FormatMajorVersion = iota - 1 + + // FormatNewest always contains the most recent format major version. + FormatNewest FormatMajorVersion = internalFormatNewest +) + +// MaxTableFormat returns the maximum sstable.TableFormat that can be used at +// this FormatMajorVersion. +func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat { + switch v { + case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker, + FormatVersioned, FormatSetWithDelete: + return sstable.TableFormatRocksDBv2 + case FormatBlockPropertyCollector, FormatSplitUserKeysMarked, + FormatSplitUserKeysMarkedCompacted: + return sstable.TableFormatPebblev1 + case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, + formatUnusedPrePebblev1MarkedCompacted: + return sstable.TableFormatPebblev2 + case FormatSSTableValueBlocks, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted: + return sstable.TableFormatPebblev3 + case FormatDeleteSizedAndObsolete, FormatVirtualSSTables: + return sstable.TableFormatPebblev4 + default: + panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) + } +} + +// MinTableFormat returns the minimum sstable.TableFormat that can be used at +// this FormatMajorVersion. 
+func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat { + switch v { + case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker, + FormatVersioned, FormatSetWithDelete, FormatBlockPropertyCollector, + FormatSplitUserKeysMarked, FormatSplitUserKeysMarkedCompacted, + FormatRangeKeys: + return sstable.TableFormatLevelDB + case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, + formatUnusedPrePebblev1MarkedCompacted, FormatSSTableValueBlocks, + FormatFlushableIngest, FormatPrePebblev1MarkedCompacted, + FormatDeleteSizedAndObsolete, FormatVirtualSSTables: + return sstable.TableFormatPebblev1 + default: + panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) + } +} + +// orderingInvariants returns an enum encoding the set of invariants that must +// hold within the receiver format major version. Invariants only get stricter +// as the format major version advances, so it is okay to retrieve the +// invariants from the current format major version and by the time the +// invariants are enforced, the format major version has advanced. +func (v FormatMajorVersion) orderingInvariants() manifest.OrderingInvariants { + if v < FormatSplitUserKeysMarkedCompacted { + return manifest.AllowSplitUserKeys + } + return manifest.ProhibitSplitUserKeys +} + +// formatMajorVersionMigrations defines the migrations from one format +// major version to the next. Each migration is defined as a closure +// which will be invoked on the database before the new format major +// version is committed. Migrations must be idempotent. Migrations are +// invoked with d.mu locked. +// +// Each migration is responsible for invoking finalizeFormatVersUpgrade +// to set the new format major version. RatchetFormatMajorVersion will +// panic if a migration returns a nil error but fails to finalize the +// new format major version. 
+var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{ + FormatMostCompatible: func(d *DB) error { return nil }, + formatVersionedManifestMarker: func(d *DB) error { + // formatVersionedManifestMarker introduces the use of a marker + // file for pointing to the current MANIFEST file. + + // Lock the manifest. + d.mu.versions.logLock() + defer d.mu.versions.logUnlock() + + // Construct the filename of the currently active manifest and + // move the manifest marker to that filename. The marker is + // guaranteed to exist, because we unconditionally locate it + // during Open. + manifestFileNum := d.mu.versions.manifestFileNum + filename := base.MakeFilename(fileTypeManifest, manifestFileNum) + if err := d.mu.versions.manifestMarker.Move(filename); err != nil { + return errors.Wrap(err, "moving manifest marker") + } + + // Now that we have a manifest marker file in place and pointing + // to the current MANIFEST, finalize the upgrade. If we fail for + // some reason, a retry of this migration is guaranteed to again + // move the manifest marker file to the latest manifest. If + // we're unable to finalize the upgrade, a subsequent call to + // Open will ignore the manifest marker. + if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil { + return err + } + + // We've finalized the upgrade. All subsequent Open calls will + // ignore the CURRENT file and instead read the manifest marker. + // Before we unlock the manifest, we need to update versionSet + // to use the manifest marker on future rotations. + d.mu.versions.setCurrent = setCurrentFuncMarker( + d.mu.versions.manifestMarker, + d.mu.versions.fs, + d.mu.versions.dirname) + return nil + }, + // The FormatVersioned version is split into two, each with their + // own migration to ensure the post-migration cleanup happens even + // if there's a crash immediately after finalizing the version. 
Once + // a new format major version is finalized, its migration will never + // run again. Post-migration cleanup like the one in the migration + // below must be performed in a separate migration or every time the + // database opens. + FormatVersioned: func(d *DB) error { + // Replace the `CURRENT` file with one that points to the + // nonexistent `MANIFEST-000000` file. If an earlier Pebble + // version that does not know about format major versions + // attempts to open the database, it will error avoiding + // accidental corruption. + if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, base.FileNum(0).DiskFileNum()); err != nil { + return err + } + return d.finalizeFormatVersUpgrade(FormatVersioned) + }, + // As SetWithDelete is a new key kind, there is nothing to migrate. We can + // simply finalize the format version and we're done. + FormatSetWithDelete: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatSetWithDelete) + }, + FormatBlockPropertyCollector: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector) + }, + FormatSplitUserKeysMarked: func(d *DB) error { + // Mark any unmarked files with split-user keys. Note all format major + // versions migrations are invoked with DB.mu locked. + if err := d.markFilesLocked(markFilesWithSplitUserKeys(d.opts.Comparer.Equal)); err != nil { + return err + } + return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarked) + }, + FormatSplitUserKeysMarkedCompacted: func(d *DB) error { + // Before finalizing the format major version, rewrite any sstables + // still marked for compaction. Note all format major versions + // migrations are invoked with DB.mu locked. 
+ if err := d.compactMarkedFilesLocked(); err != nil { + return err + } + return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarkedCompacted) + }, + FormatRangeKeys: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatRangeKeys) + }, + FormatMinTableFormatPebblev1: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatMinTableFormatPebblev1) + }, + FormatPrePebblev1Marked: func(d *DB) error { + // Mark any unmarked files that contain only table properties. Note all + // format major versions migrations are invoked with DB.mu locked. + if err := d.markFilesLocked(markFilesPrePebblev1(d.tableCache)); err != nil { + return err + } + return d.finalizeFormatVersUpgrade(FormatPrePebblev1Marked) + }, + formatUnusedPrePebblev1MarkedCompacted: func(d *DB) error { + // Intentional no-op. + return d.finalizeFormatVersUpgrade(formatUnusedPrePebblev1MarkedCompacted) + }, + FormatSSTableValueBlocks: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks) + }, + FormatFlushableIngest: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatFlushableIngest) + }, + FormatPrePebblev1MarkedCompacted: func(d *DB) error { + // Before finalizing the format major version, rewrite any sstables + // still marked for compaction. Note all format major versions + // migrations are invoked with DB.mu locked. 
+ if err := d.compactMarkedFilesLocked(); err != nil { + return err + } + return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted) + }, + FormatDeleteSizedAndObsolete: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete) + }, + FormatVirtualSSTables: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatVirtualSSTables) + }, +} + +const formatVersionMarkerName = `format-version` + +func lookupFormatMajorVersion( + fs vfs.FS, dirname string, +) (FormatMajorVersion, *atomicfs.Marker, error) { + m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName) + if err != nil { + return 0, nil, err + } + if versString == "" { + return FormatMostCompatible, m, nil + } + v, err := strconv.ParseUint(versString, 10, 64) + if err != nil { + return 0, nil, errors.Wrap(err, "parsing format major version") + } + vers := FormatMajorVersion(v) + if vers == FormatDefault { + return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers) + } + if vers > internalFormatNewest { + return 0, nil, errors.Newf("pebble: database %q written in format major version %d", dirname, vers) + } + return vers, m, nil +} + +// FormatMajorVersion returns the database's active format major +// version. The format major version may be higher than the one +// provided in Options when the database was opened if the existing +// database was written with a higher format version. +func (d *DB) FormatMajorVersion() FormatMajorVersion { + return FormatMajorVersion(d.mu.formatVers.vers.Load()) +} + +// RatchetFormatMajorVersion ratchets the opened database's format major +// version to the provided version. It errors if the provided format +// major version is below the database's current version. Once a +// database's format major version is upgraded, previous Pebble versions +// that do not know of the format version will be unable to open the +// database. 
+func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error { + if err := d.closed.Load(); err != nil { + panic(err) + } + + d.mu.Lock() + defer d.mu.Unlock() + return d.ratchetFormatMajorVersionLocked(fmv) +} + +func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error { + if d.opts.ReadOnly { + return ErrReadOnly + } + if formatVers > internalFormatNewest { + // Guard against accidentally forgetting to update internalFormatNewest. + return errors.Errorf("pebble: unknown format version %d", formatVers) + } + if currentVers := d.FormatMajorVersion(); currentVers > formatVers { + return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d", + currentVers, formatVers) + } + if d.mu.formatVers.ratcheting { + return errors.Newf("pebble: database format major version upgrade is in-progress") + } + d.mu.formatVers.ratcheting = true + defer func() { d.mu.formatVers.ratcheting = false }() + + for nextVers := d.FormatMajorVersion() + 1; nextVers <= formatVers; nextVers++ { + if err := formatMajorVersionMigrations[nextVers](d); err != nil { + return errors.Wrapf(err, "migrating to version %d", nextVers) + } + + // NB: The migration is responsible for calling + // finalizeFormatVersUpgrade to finalize the upgrade. This + // structure is necessary because some migrations may need to + // update in-memory state (without ever dropping locks) after + // the upgrade is finalized. Here we assert that the upgrade + // did occur. + if d.FormatMajorVersion() != nextVers { + d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", nextVers) + } + } + return nil +} + +// finalizeFormatVersUpgrade is typically only be called from within a +// format major version migration. +// +// See formatMajorVersionMigrations. 
+func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error { + // We use the marker to encode the active format version in the + // marker filename. Unlike other uses of the atomic marker, there is + // no file with the filename `formatVers.String()` on the + // filesystem. + if err := d.mu.formatVers.marker.Move(formatVers.String()); err != nil { + return err + } + d.mu.formatVers.vers.Store(uint64(formatVers)) + d.opts.EventListener.FormatUpgrade(formatVers) + return nil +} + +// compactMarkedFilesLocked performs a migration that schedules rewrite +// compactions to compact away any sstables marked for compaction. +// compactMarkedFilesLocked is run while ratcheting the database's format major +// version to FormatSplitUserKeysMarkedCompacted. +// +// Note that while this method is called with the DB.mu held, and will not +// return until all marked files have been compacted, the mutex is dropped while +// waiting for compactions to complete (or for slots to free up). +func (d *DB) compactMarkedFilesLocked() error { + curr := d.mu.versions.currentVersion() + for curr.Stats.MarkedForCompaction > 0 { + // Attempt to schedule a compaction to rewrite a file marked for + // compaction. + d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction { + return picker.pickRewriteCompaction(env) + }) + + // The above attempt might succeed and schedule a rewrite compaction. Or + // there might not be available compaction concurrency to schedule the + // compaction. Or compaction of the file might have already been in + // progress. In any scenario, wait until there's some change in the + // state of active compactions. + + // Before waiting, check that the database hasn't been closed. Trying to + // schedule the compaction may have dropped d.mu while waiting for a + // manifest write to complete. In that dropped interim, the database may + // have been closed. 
+ if err := d.closed.Load(); err != nil { + return err.(error) + } + + // Some flush or compaction may have scheduled or completed while we waited + // for the manifest lock in maybeScheduleCompactionPicker. Get the latest + // Version before waiting on a compaction. + curr = d.mu.versions.currentVersion() + + // Only wait on compactions if there are files still marked for compaction. + // NB: Waiting on this condition variable drops d.mu while blocked. + if curr.Stats.MarkedForCompaction > 0 { + if d.mu.compact.compactingCount == 0 { + panic("expected a compaction of marked files in progress") + } + d.mu.compact.cond.Wait() + // Refresh the current version again. + curr = d.mu.versions.currentVersion() + } + } + return nil +} + +// findFilesFunc scans the LSM for files, returning true if at least one +// file was found. The returned array contains the matched files, if any, per +// level. +type findFilesFunc func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) + +// markFilesWithSplitUserKeys scans the LSM's levels 1 through 6 for adjacent +// files that contain the same user key. Such arrangements of files were +// permitted in RocksDB and in Pebble up to SHA a860bbad. +var markFilesWithSplitUserKeys = func(equal Equal) findFilesFunc { + return func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) { + // Files with split user keys are expected to be rare and performing key + // comparisons for every file within the LSM is expensive, so drop the + // database lock while scanning the file metadata. + for l := numLevels - 1; l > 0; l-- { + iter := v.Levels[l].Iter() + var prevFile *fileMetadata + var prevUserKey []byte + for f := iter.First(); f != nil; f = iter.Next() { + if prevUserKey != nil && equal(prevUserKey, f.Smallest.UserKey) { + // NB: We may append a file twice, once as prevFile and once + // as f. That's okay, and handled below. 
+ files[l] = append(files[l], prevFile, f) + found = true + } + if f.Largest.IsExclusiveSentinel() { + prevUserKey = nil + prevFile = nil + } else { + prevUserKey = f.Largest.UserKey + prevFile = f + } + } + } + return + } +} + +// markFilesPrePebblev1 scans the LSM for files that do not support block +// properties (i.e. a table format version pre-Pebblev1). +var markFilesPrePebblev1 = func(tc *tableCacheContainer) findFilesFunc { + return func(v *version) (found bool, files [numLevels][]*fileMetadata, err error) { + for l := numLevels - 1; l > 0; l-- { + iter := v.Levels[l].Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.Virtual { + // Any physical sstable which has been virtualized must + // have already undergone this migration, and we don't + // need to worry about the virtual sstables themselves. + panic("pebble: unexpected virtual sstable during migration") + } + err = tc.withReader( + f.PhysicalMeta(), func(r *sstable.Reader) error { + tf, err := r.TableFormat() + if err != nil { + return err + } + if tf < sstable.TableFormatPebblev1 { + found = true + files[l] = append(files[l], f) + } + return nil + }) + if err != nil { + return + } + } + } + return + } +} + +// markFilesLocked durably marks the files that match the given findFilesFunc for +// compaction. +func (d *DB) markFilesLocked(findFn findFilesFunc) error { + jobID := d.mu.nextJobID + d.mu.nextJobID++ + + // Acquire a read state to have a view of the LSM and a guarantee that none + // of the referenced files will be deleted until we've unreferenced the read + // state. Some findFilesFuncs may read the files, requiring they not be + // deleted. + rs := d.loadReadState() + var ( + found bool + files [numLevels][]*fileMetadata + err error + ) + func() { + defer rs.unrefLocked() + // Note the unusual locking: unlock, defer Lock(). The scan of the files in + // the version does not need to block other operations that require the + // DB.mu. Drop it for the scan, before re-acquiring it. 
+ d.mu.Unlock() + defer d.mu.Lock() + found, files, err = findFn(rs.current) + }() + if err != nil { + return err + } + + // The database lock has been acquired again by the defer within the above + // anonymous function. + if !found { + // Nothing to do. + return nil + } + + // After scanning, if we found files to mark, we fetch the current state of + // the LSM (which may have changed) and set MarkedForCompaction on the files, + // and update the version's Stats.MarkedForCompaction count, which are both + // protected by d.mu. + + // Lock the manifest for a coherent view of the LSM. The database lock has + // been re-acquired by the defer within the above anonymous function. + d.mu.versions.logLock() + vers := d.mu.versions.currentVersion() + for l, filesToMark := range files { + if len(filesToMark) == 0 { + continue + } + for _, f := range filesToMark { + // Ignore files to be marked that have already been compacted or marked. + if f.CompactionState == manifest.CompactionStateCompacted || + f.MarkedForCompaction { + continue + } + // Else, mark the file for compaction in this version. + vers.Stats.MarkedForCompaction++ + f.MarkedForCompaction = true + } + // The compaction picker uses the markedForCompactionAnnotator to + // quickly find files marked for compaction, or to quickly determine + // that there are no such files marked for compaction within a level. + // A b-tree node may be annotated with an annotation recording that + // there are no files marked for compaction within the node's subtree, + // based on the assumption that it's static. + // + // Since we're marking files for compaction, these b-tree nodes' + // annotations will be out of date. Clear the compaction-picking + // annotation, so that it's recomputed the next time the compaction + // picker looks for a file marked for compaction. + vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) + } + + // The 'marked-for-compaction' bit is persisted in the MANIFEST file + // metadata. 
We've already modified the in-memory file metadata, but the + manifest hasn't been updated. Force rotation to a new MANIFEST file, + which will write every file metadata to the new manifest file and ensure + that the now marked-for-compaction file metadata are persisted as marked. + NB: This call to logAndApply will unlock the MANIFEST, which we locked up + above before obtaining `vers`. + return d.mu.versions.logAndApply( + jobID, + &manifest.VersionEdit{}, + map[int]*LevelMetrics{}, + true, /* forceRotation */ + func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) }) +} diff --git a/pebble/format_major_version_test.go b/pebble/format_major_version_test.go new file mode 100644 index 0000000..bbca42b --- /dev/null +++ b/pebble/format_major_version_test.go @@ -0,0 +1,580 @@ +// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "fmt" + "strconv" + "sync" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/bloom" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/vfs/atomicfs" + "github.com/stretchr/testify/require" +) + +func TestFormatMajorVersion_MigrationDefined(t *testing.T) { + for v := FormatMostCompatible; v <= FormatNewest; v++ { + if _, ok := formatMajorVersionMigrations[v]; !ok { + t.Errorf("format major version %d has no migration defined", v) + } + } +} + +func TestRatchetFormat(t *testing.T) { + fs := vfs.NewMem() + d, err := Open("", (&Options{FS: fs}).WithFSDefaults()) + require.NoError(t, err) + require.NoError(t, d.Set([]byte("foo"), []byte("bar"), Sync)) + require.Equal(t, FormatMostCompatible, d.FormatMajorVersion()) + require.NoError(t, 
d.RatchetFormatMajorVersion(FormatVersioned)) + require.Equal(t, FormatVersioned, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatVersioned)) + require.Equal(t, FormatVersioned, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatSetWithDelete)) + require.Equal(t, FormatSetWithDelete, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatBlockPropertyCollector)) + require.Equal(t, FormatBlockPropertyCollector, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatSplitUserKeysMarked)) + require.Equal(t, FormatSplitUserKeysMarked, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatSplitUserKeysMarkedCompacted)) + require.Equal(t, FormatSplitUserKeysMarkedCompacted, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatRangeKeys)) + require.Equal(t, FormatRangeKeys, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatMinTableFormatPebblev1)) + require.Equal(t, FormatMinTableFormatPebblev1, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatPrePebblev1Marked)) + require.Equal(t, FormatPrePebblev1Marked, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(formatUnusedPrePebblev1MarkedCompacted)) + require.Equal(t, formatUnusedPrePebblev1MarkedCompacted, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatSSTableValueBlocks)) + require.Equal(t, FormatSSTableValueBlocks, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatFlushableIngest)) + require.Equal(t, FormatFlushableIngest, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatPrePebblev1MarkedCompacted)) + require.Equal(t, FormatPrePebblev1MarkedCompacted, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatDeleteSizedAndObsolete)) + require.Equal(t, 
FormatDeleteSizedAndObsolete, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(FormatVirtualSSTables)) + require.Equal(t, FormatVirtualSSTables, d.FormatMajorVersion()) + + require.NoError(t, d.Close()) + + // If we Open the database again, leaving the default format, the + // database should Open using the persisted FormatNewest. + d, err = Open("", (&Options{FS: fs}).WithFSDefaults()) + require.NoError(t, err) + require.Equal(t, internalFormatNewest, d.FormatMajorVersion()) + require.NoError(t, d.Close()) + + // Move the marker to a version that does not exist. + m, _, err := atomicfs.LocateMarker(fs, "", formatVersionMarkerName) + require.NoError(t, err) + require.NoError(t, m.Move("999999")) + require.NoError(t, m.Close()) + + _, err = Open("", (&Options{ + FS: fs, + FormatMajorVersion: FormatVersioned, + }).WithFSDefaults()) + require.Error(t, err) + require.EqualError(t, err, `pebble: database "" written in format major version 999999`) +} + +func testBasicDB(d *DB) error { + key := []byte("a") + value := []byte("b") + if err := d.Set(key, value, nil); err != nil { + return err + } + if err := d.Flush(); err != nil { + return err + } + if err := d.Compact(nil, []byte("\xff"), false); err != nil { + return err + } + + iter, _ := d.NewIter(nil) + for valid := iter.First(); valid; valid = iter.Next() { + } + if err := iter.Close(); err != nil { + return err + } + return nil +} + +func TestFormatMajorVersions(t *testing.T) { + for vers := FormatMostCompatible; vers <= FormatNewest; vers++ { + t.Run(fmt.Sprintf("vers=%03d", vers), func(t *testing.T) { + fs := vfs.NewStrictMem() + opts := (&Options{ + FS: fs, + FormatMajorVersion: vers, + }).WithFSDefaults() + + // Create a database at this format major version and perform + // some very basic operations. 
+ d, err := Open("", opts) + require.NoError(t, err) + require.NoError(t, testBasicDB(d)) + require.NoError(t, d.Close()) + + // Re-open the database at this format major version, and again + // perform some basic operations. + d, err = Open("", opts) + require.NoError(t, err) + require.NoError(t, testBasicDB(d)) + require.NoError(t, d.Close()) + + t.Run("upgrade-at-open", func(t *testing.T) { + for upgradeVers := vers + 1; upgradeVers <= FormatNewest; upgradeVers++ { + t.Run(fmt.Sprintf("upgrade-vers=%03d", upgradeVers), func(t *testing.T) { + // We use vfs.MemFS's option to ignore syncs so + // that we can perform an upgrade on the current + // database state in fs, and revert it when this + // subtest is complete. + fs.SetIgnoreSyncs(true) + defer fs.ResetToSyncedState() + + // Re-open the database, passing a higher format + // major version in the Options to automatically + // ratchet the format major version. Ensure some + // basic operations pass. + opts := opts.Clone() + opts.FormatMajorVersion = upgradeVers + d, err = Open("", opts) + require.NoError(t, err) + require.Equal(t, upgradeVers, d.FormatMajorVersion()) + require.NoError(t, testBasicDB(d)) + require.NoError(t, d.Close()) + + // Re-open to ensure the upgrade persisted. + d, err = Open("", opts) + require.NoError(t, err) + require.Equal(t, upgradeVers, d.FormatMajorVersion()) + require.NoError(t, testBasicDB(d)) + require.NoError(t, d.Close()) + }) + } + }) + + t.Run("upgrade-while-open", func(t *testing.T) { + for upgradeVers := vers + 1; upgradeVers <= FormatNewest; upgradeVers++ { + t.Run(fmt.Sprintf("upgrade-vers=%03d", upgradeVers), func(t *testing.T) { + // Ensure the previous tests don't overwrite our + // options. + require.Equal(t, vers, opts.FormatMajorVersion) + + // We use vfs.MemFS's option to ignore syncs so + // that we can perform an upgrade on the current + // database state in fs, and revert it when this + // subtest is complete. 
+ fs.SetIgnoreSyncs(true) + defer fs.ResetToSyncedState() + + // Re-open the database, still at the current format + // major version. Perform some basic operations, + // ratchet the format version up, and perform + // additional basic operations. + d, err = Open("", opts) + require.NoError(t, err) + require.NoError(t, testBasicDB(d)) + require.Equal(t, vers, d.FormatMajorVersion()) + require.NoError(t, d.RatchetFormatMajorVersion(upgradeVers)) + require.Equal(t, upgradeVers, d.FormatMajorVersion()) + require.NoError(t, testBasicDB(d)) + require.NoError(t, d.Close()) + + // Re-open to ensure the upgrade persisted. + d, err = Open("", opts) + require.NoError(t, err) + require.Equal(t, upgradeVers, d.FormatMajorVersion()) + require.NoError(t, testBasicDB(d)) + require.NoError(t, d.Close()) + }) + } + }) + }) + } +} + +func TestFormatMajorVersions_TableFormat(t *testing.T) { + // NB: This test is intended to validate the mapping between every + // FormatMajorVersion and sstable.TableFormat exhaustively. This serves as a + // sanity check that new versions have a corresponding mapping. The test + // fixture is intentionally verbose. 
+ + m := map[FormatMajorVersion][2]sstable.TableFormat{ + FormatDefault: {sstable.TableFormatLevelDB, sstable.TableFormatRocksDBv2}, + FormatMostCompatible: {sstable.TableFormatLevelDB, sstable.TableFormatRocksDBv2}, + formatVersionedManifestMarker: {sstable.TableFormatLevelDB, sstable.TableFormatRocksDBv2}, + FormatVersioned: {sstable.TableFormatLevelDB, sstable.TableFormatRocksDBv2}, + FormatSetWithDelete: {sstable.TableFormatLevelDB, sstable.TableFormatRocksDBv2}, + FormatBlockPropertyCollector: {sstable.TableFormatLevelDB, sstable.TableFormatPebblev1}, + FormatSplitUserKeysMarked: {sstable.TableFormatLevelDB, sstable.TableFormatPebblev1}, + FormatSplitUserKeysMarkedCompacted: {sstable.TableFormatLevelDB, sstable.TableFormatPebblev1}, + FormatRangeKeys: {sstable.TableFormatLevelDB, sstable.TableFormatPebblev2}, + FormatMinTableFormatPebblev1: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2}, + FormatPrePebblev1Marked: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2}, + formatUnusedPrePebblev1MarkedCompacted: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev2}, + FormatSSTableValueBlocks: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3}, + FormatFlushableIngest: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3}, + FormatPrePebblev1MarkedCompacted: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev3}, + FormatDeleteSizedAndObsolete: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev4}, + FormatVirtualSSTables: {sstable.TableFormatPebblev1, sstable.TableFormatPebblev4}, + } + + // Valid versions. + for fmv := FormatMostCompatible; fmv <= internalFormatNewest; fmv++ { + got := [2]sstable.TableFormat{fmv.MinTableFormat(), fmv.MaxTableFormat()} + require.Equalf(t, m[fmv], got, "got %s; want %s", got, m[fmv]) + require.True(t, got[0] <= got[1] /* min <= max */) + } + + // Invalid versions. 
+ fmv := internalFormatNewest + 1 + require.Panics(t, func() { _ = fmv.MaxTableFormat() }) + require.Panics(t, func() { _ = fmv.MinTableFormat() }) +} + +func TestSplitUserKeyMigration(t *testing.T) { + var d *DB + var opts *Options + var fs vfs.FS + var buf bytes.Buffer + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + datadriven.RunTest(t, "testdata/format_major_version_split_user_key_migration", + func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + if d != nil { + if err := d.Close(); err != nil { + return err.Error() + } + buf.Reset() + } + opts = (&Options{ + FormatMajorVersion: FormatBlockPropertyCollector, + EventListener: &EventListener{ + CompactionEnd: func(info CompactionInfo) { + // Fix the job ID and durations for determinism. + info.JobID = 100 + info.Duration = time.Second + info.TotalDuration = 2 * time.Second + fmt.Fprintln(&buf, info) + }, + }, + DisableAutomaticCompactions: true, + }).WithFSDefaults() + var err error + if d, err = runDBDefineCmd(td, opts); err != nil { + return err.Error() + } + + fs = d.opts.FS + d.mu.Lock() + defer d.mu.Unlock() + return d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + case "reopen": + if d != nil { + if err := d.Close(); err != nil { + return err.Error() + } + buf.Reset() + } + opts.FS = fs + opts.DisableAutomaticCompactions = true + var err error + d, err = Open("", opts) + if err != nil { + return err.Error() + } + return "OK" + case "build": + if err := runBuildCmd(td, d, fs); err != nil { + return err.Error() + } + return "" + case "force-ingest": + if err := runForceIngestCmd(td, d); err != nil { + return err.Error() + } + d.mu.Lock() + defer d.mu.Unlock() + return d.mu.versions.currentVersion().DebugString(base.DefaultFormatter) + case "format-major-version": + return d.FormatMajorVersion().String() + case "ratchet-format-major-version": + v, err := strconv.Atoi(td.CmdArgs[0].String()) + if err != nil { + return 
err.Error() + } + if err := d.RatchetFormatMajorVersion(FormatMajorVersion(v)); err != nil { + return err.Error() + } + return buf.String() + case "lsm": + return runLSMCmd(td, d) + case "marked-file-count": + m := d.Metrics() + return fmt.Sprintf("%d files marked for compaction", m.Compact.MarkedFiles) + case "disable-automatic-compactions": + d.mu.Lock() + defer d.mu.Unlock() + switch v := td.CmdArgs[0].String(); v { + case "true": + d.opts.DisableAutomaticCompactions = true + case "false": + d.opts.DisableAutomaticCompactions = false + default: + return fmt.Sprintf("unknown value %q", v) + } + return "" + default: + return fmt.Sprintf("unrecognized command %q", td.Cmd) + } + }) +} + +func TestPebblev1Migration(t *testing.T) { + var d *DB + defer func() { + if d != nil { + require.NoError(t, d.Close()) + } + }() + + datadriven.RunTest(t, "testdata/format_major_version_pebblev1_migration", + func(t *testing.T, td *datadriven.TestData) string { + switch cmd := td.Cmd; cmd { + case "open": + var version int + var err error + for _, cmdArg := range td.CmdArgs { + switch cmd := cmdArg.Key; cmd { + case "version": + version, err = strconv.Atoi(cmdArg.Vals[0]) + if err != nil { + return err.Error() + } + default: + return fmt.Sprintf("unknown argument: %s", cmd) + } + } + opts := (&Options{ + FS: vfs.NewMem(), + FormatMajorVersion: FormatMajorVersion(version), + }).WithFSDefaults() + d, err = Open("", opts) + if err != nil { + return err.Error() + } + return "" + + case "format-major-version": + return d.FormatMajorVersion().String() + + case "min-table-format": + return d.FormatMajorVersion().MinTableFormat().String() + + case "max-table-format": + return d.FormatMajorVersion().MaxTableFormat().String() + + case "disable-automatic-compactions": + d.mu.Lock() + defer d.mu.Unlock() + switch v := td.CmdArgs[0].String(); v { + case "true": + d.opts.DisableAutomaticCompactions = true + case "false": + d.opts.DisableAutomaticCompactions = false + default: + return 
fmt.Sprintf("unknown value %q", v) + } + return "" + + case "batch": + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(nil); err != nil { + return err.Error() + } + return "" + + case "flush": + if err := d.Flush(); err != nil { + return err.Error() + } + return "" + + case "ingest": + if err := runBuildCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + // Only the first arg is a filename. + td.CmdArgs = td.CmdArgs[:1] + if err := runIngestCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" + + case "lsm": + return runLSMCmd(td, d) + + case "tally-table-formats": + d.mu.Lock() + defer d.mu.Unlock() + v := d.mu.versions.currentVersion() + tally := make([]int, sstable.TableFormatMax+1) + for _, l := range v.Levels { + iter := l.Iter() + for m := iter.First(); m != nil; m = iter.Next() { + err := d.tableCache.withReader(m.PhysicalMeta(), + func(r *sstable.Reader) error { + f, err := r.TableFormat() + if err != nil { + return err + } + tally[f]++ + return nil + }) + if err != nil { + return err.Error() + } + } + } + var b bytes.Buffer + for i := 1; i <= int(sstable.TableFormatMax); i++ { + _, _ = fmt.Fprintf(&b, "%s: %d\n", sstable.TableFormat(i), tally[i]) + } + return b.String() + + case "ratchet-format-major-version": + v, err := strconv.Atoi(td.CmdArgs[0].String()) + if err != nil { + return err.Error() + } + if err = d.RatchetFormatMajorVersion(FormatMajorVersion(v)); err != nil { + return err.Error() + } + return "" + + case "marked-file-count": + m := d.Metrics() + return fmt.Sprintf("%d files marked for compaction", m.Compact.MarkedFiles) + + default: + return fmt.Sprintf("unknown command: %s", cmd) + } + }, + ) +} + +// TestPebblev1MigrationRace exercises the race between a PrePebbleV1Marked +// format major version upgrade that needs to open sstables to read their table +// format, and concurrent compactions that may delete the same files from the +// 
LSM. +// +// Regression test for #2019. +func TestPebblev1MigrationRace(t *testing.T) { + // Use a smaller table cache size to slow down the PrePebbleV1Marked + // migration, ensuring each table read needs to re-open the file. + cache := NewCache(4 << 20) + defer cache.Unref() + tableCache := NewTableCache(cache, 1, 5) + defer tableCache.Unref() + d, err := Open("", (&Options{ + Cache: cache, + FS: vfs.NewMem(), + FormatMajorVersion: FormatMajorVersion(FormatPrePebblev1Marked - 1), + TableCache: tableCache, + Levels: []LevelOptions{{TargetFileSize: 1}}, + }).WithFSDefaults()) + require.NoError(t, err) + defer d.Close() + + ks := testkeys.Alpha(3).EveryN(10) + var key [3]byte + for i := int64(0); i < ks.Count(); i++ { + n := testkeys.WriteKey(key[:], ks, i) + require.NoError(t, d.Set(key[:n], key[:n], nil)) + require.NoError(t, d.Flush()) + } + + // Asynchronously write and flush range deletes that will cause compactions + // to delete the existing sstables. These deletes will race with the format + // major version upgrade's migration will attempt to delete the files. + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for i := ks.Count() - 1; i > 0; i -= 50 { + endKey := testkeys.Key(ks, i) + startIndex := i - 50 + if startIndex < 0 { + startIndex = 0 + } + startKey := testkeys.Key(ks, startIndex) + + require.NoError(t, d.DeleteRange(startKey, endKey, nil)) + _, err := d.AsyncFlush() + require.NoError(t, err) + } + }() + require.NoError(t, d.RatchetFormatMajorVersion(FormatPrePebblev1Marked)) + wg.Wait() +} + +// Regression test for #2044, where multiple concurrent compactions can lead +// to an indefinite wait on the compaction goroutine in compactMarkedFilesLocked. 
+func TestPebblev1MigrationConcurrencyRace(t *testing.T) { + opts := (&Options{ + Comparer: testkeys.Comparer, + FS: vfs.NewMem(), + FormatMajorVersion: FormatSplitUserKeysMarked, + Levels: []LevelOptions{{FilterPolicy: bloom.FilterPolicy(10)}}, + MaxConcurrentCompactions: func() int { + return 4 + }, + }).WithFSDefaults() + func() { + d, err := Open("", opts) + require.NoError(t, err) + defer func() { + require.NoError(t, d.Close()) + }() + + ks := testkeys.Alpha(3).EveryN(10) + var key [3]byte + for i := int64(0); i < ks.Count(); i++ { + n := testkeys.WriteKey(key[:], ks, i) + require.NoError(t, d.Set(key[:n], key[:n], nil)) + if i%100 == 0 { + require.NoError(t, d.Flush()) + } + } + require.NoError(t, d.Flush()) + }() + + opts.FormatMajorVersion = formatUnusedPrePebblev1MarkedCompacted + d, err := Open("", opts) + require.NoError(t, err) + require.NoError(t, d.RatchetFormatMajorVersion(formatUnusedPrePebblev1MarkedCompacted)) + require.NoError(t, d.Close()) +} diff --git a/pebble/get_iter.go b/pebble/get_iter.go new file mode 100644 index 0000000..6ebdd59 --- /dev/null +++ b/pebble/get_iter.go @@ -0,0 +1,258 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "fmt" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/sstable" +) + +// getIter is an internal iterator used to perform gets. It iterates through +// the values for a particular key, level by level. It is not a general purpose +// internalIterator, but specialized for Get operations so that it loads data +// lazily. 
+type getIter struct { + logger Logger + comparer *Comparer + newIters tableNewIters + snapshot uint64 + key []byte + iter internalIterator + rangeDelIter keyspan.FragmentIterator + tombstone *keyspan.Span + levelIter levelIter + level int + batch *Batch + mem flushableList + l0 []manifest.LevelSlice + version *version + iterKey *InternalKey + iterValue base.LazyValue + err error +} + +// TODO(sumeer): CockroachDB code doesn't use getIter, but, for completeness, +// make this implement InternalIteratorWithStats. + +// getIter implements the base.InternalIterator interface. +var _ base.InternalIterator = (*getIter)(nil) + +func (g *getIter) String() string { + return fmt.Sprintf("len(l0)=%d, len(mem)=%d, level=%d", len(g.l0), len(g.mem), g.level) +} + +func (g *getIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { + panic("pebble: SeekGE unimplemented") +} + +func (g *getIter) SeekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + panic("pebble: SeekPrefixGE unimplemented") +} + +func (g *getIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { + panic("pebble: SeekLT unimplemented") +} + +func (g *getIter) First() (*InternalKey, base.LazyValue) { + return g.Next() +} + +func (g *getIter) Last() (*InternalKey, base.LazyValue) { + panic("pebble: Last unimplemented") +} + +func (g *getIter) Next() (*InternalKey, base.LazyValue) { + if g.iter != nil { + g.iterKey, g.iterValue = g.iter.Next() + } + + for { + if g.iter != nil { + // We have to check rangeDelIter on each iteration because a single + // user-key can be spread across multiple tables in a level. A range + // tombstone will appear in the table corresponding to its start + // key. Every call to levelIter.Next() potentially switches to a new + // table and thus reinitializes rangeDelIter. 
+ if g.rangeDelIter != nil { + g.tombstone = keyspan.Get(g.comparer.Compare, g.rangeDelIter, g.key) + if g.err = g.rangeDelIter.Close(); g.err != nil { + return nil, base.LazyValue{} + } + g.rangeDelIter = nil + } + + if g.iterKey != nil { + key := g.iterKey + if g.tombstone != nil && g.tombstone.CoversAt(g.snapshot, key.SeqNum()) { + // We have a range tombstone covering this key. Rather than return a + // point or range deletion here, we return false and close our + // internal iterator which will make Valid() return false, + // effectively stopping iteration. + g.err = g.iter.Close() + g.iter = nil + return nil, base.LazyValue{} + } + if g.comparer.Equal(g.key, key.UserKey) { + if !key.Visible(g.snapshot, base.InternalKeySeqNumMax) { + g.iterKey, g.iterValue = g.iter.Next() + continue + } + return g.iterKey, g.iterValue + } + } + // We've advanced the iterator past the desired key. Move on to the + // next memtable / level. + g.err = g.iter.Close() + g.iter = nil + if g.err != nil { + return nil, base.LazyValue{} + } + } + + // Create an iterator from the batch. + if g.batch != nil { + if g.batch.index == nil { + g.err = ErrNotIndexed + g.iterKey, g.iterValue = nil, base.LazyValue{} + return nil, base.LazyValue{} + } + g.iter = g.batch.newInternalIter(nil) + g.rangeDelIter = g.batch.newRangeDelIter( + nil, + // Get always reads the entirety of the batch's history, so no + // batch keys should be filtered. + base.InternalKeySeqNumMax, + ) + g.iterKey, g.iterValue = g.iter.SeekGE(g.key, base.SeekGEFlagsNone) + g.batch = nil + continue + } + + // If we have a tombstone from a previous level it is guaranteed to delete + // keys in lower levels. + if g.tombstone != nil && g.tombstone.VisibleAt(g.snapshot) { + return nil, base.LazyValue{} + } + + // Create iterators from memtables from newest to oldest. 
+ if n := len(g.mem); n > 0 { + m := g.mem[n-1] + g.iter = m.newIter(nil) + g.rangeDelIter = m.newRangeDelIter(nil) + g.mem = g.mem[:n-1] + g.iterKey, g.iterValue = g.iter.SeekGE(g.key, base.SeekGEFlagsNone) + continue + } + + if g.level == 0 { + // Create iterators from L0 from newest to oldest. + if n := len(g.l0); n > 0 { + files := g.l0[n-1].Iter() + g.l0 = g.l0[:n-1] + iterOpts := IterOptions{ + // TODO(sumeer): replace with a parameter provided by the caller. + CategoryAndQoS: sstable.CategoryAndQoS{ + Category: "pebble-get", + QoSLevel: sstable.LatencySensitiveQoSLevel, + }, + logger: g.logger, + snapshotForHideObsoletePoints: g.snapshot} + g.levelIter.init(context.Background(), iterOpts, g.comparer, g.newIters, + files, manifest.L0Sublevel(n), internalIterOpts{}) + g.levelIter.initRangeDel(&g.rangeDelIter) + bc := levelIterBoundaryContext{} + g.levelIter.initBoundaryContext(&bc) + g.iter = &g.levelIter + + // Compute the key prefix for bloom filtering if split function is + // specified, or use the user key as default. + prefix := g.key + if g.comparer.Split != nil { + prefix = g.key[:g.comparer.Split(g.key)] + } + g.iterKey, g.iterValue = g.iter.SeekPrefixGE(prefix, g.key, base.SeekGEFlagsNone) + if bc.isSyntheticIterBoundsKey || bc.isIgnorableBoundaryKey { + g.iterKey = nil + g.iterValue = base.LazyValue{} + } + continue + } + g.level++ + } + + if g.level >= numLevels { + return nil, base.LazyValue{} + } + if g.version.Levels[g.level].Empty() { + g.level++ + continue + } + + iterOpts := IterOptions{ + // TODO(sumeer): replace with a parameter provided by the caller. 
+ CategoryAndQoS: sstable.CategoryAndQoS{ + Category: "pebble-get", + QoSLevel: sstable.LatencySensitiveQoSLevel, + }, logger: g.logger, snapshotForHideObsoletePoints: g.snapshot} + g.levelIter.init(context.Background(), iterOpts, g.comparer, g.newIters, + g.version.Levels[g.level].Iter(), manifest.Level(g.level), internalIterOpts{}) + g.levelIter.initRangeDel(&g.rangeDelIter) + bc := levelIterBoundaryContext{} + g.levelIter.initBoundaryContext(&bc) + g.level++ + g.iter = &g.levelIter + + // Compute the key prefix for bloom filtering if split function is + // specified, or use the user key as default. + prefix := g.key + if g.comparer.Split != nil { + prefix = g.key[:g.comparer.Split(g.key)] + } + g.iterKey, g.iterValue = g.iter.SeekPrefixGE(prefix, g.key, base.SeekGEFlagsNone) + if bc.isSyntheticIterBoundsKey || bc.isIgnorableBoundaryKey { + g.iterKey = nil + g.iterValue = base.LazyValue{} + } + } +} + +func (g *getIter) Prev() (*InternalKey, base.LazyValue) { + panic("pebble: Prev unimplemented") +} + +func (g *getIter) NextPrefix([]byte) (*InternalKey, base.LazyValue) { + panic("pebble: NextPrefix unimplemented") +} + +func (g *getIter) Valid() bool { + return g.iterKey != nil && g.err == nil +} + +func (g *getIter) Error() error { + return g.err +} + +func (g *getIter) Close() error { + if g.iter != nil { + if err := g.iter.Close(); err != nil && g.err == nil { + g.err = err + } + g.iter = nil + } + return g.err +} + +func (g *getIter) SetBounds(lower, upper []byte) { + panic("pebble: SetBounds unimplemented") +} + +func (g *getIter) SetContext(_ context.Context) {} diff --git a/pebble/get_iter_test.go b/pebble/get_iter_test.go new file mode 100644 index 0000000..ab6e67e --- /dev/null +++ b/pebble/get_iter_test.go @@ -0,0 +1,576 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "context" + "strings" + "testing" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/testkeys" +) + +func TestGetIter(t *testing.T) { + // testTable is a table to insert into a version. + // Each element of data is a string of the form "internalKey value". + type testTable struct { + level int + fileNum FileNum + data []string + } + + testCases := []struct { + description string + // badOrdering is whether this test case has a table ordering violation. + badOrdering bool + // tables are the tables to populate the version with. + tables []testTable + // queries are the queries to run against the version. Each element has + // the form "internalKey wantedValue". The internalKey is passed to the + // version.get method, wantedValue may be "ErrNotFound" if the query + // should return that error. 
+ queries []string + }{ + { + description: "empty: an empty version", + queries: []string{ + "abc.SEPARATOR.101 ErrNotFound", + }, + }, + + { + description: "single-0: one level-0 table", + tables: []testTable{ + { + level: 0, + fileNum: 10, + data: []string{ + "the.SET.101 a", + "cat.SET.102 b", + "on_.SET.103 c", + "the.SET.104 d", + "mat.SET.105 e", + "the.DEL.106 ", + "the.MERGE.107 g", + }, + }, + }, + queries: []string{ + "aaa.SEPARATOR.105 ErrNotFound", + "cat.SEPARATOR.105 b", + "hat.SEPARATOR.105 ErrNotFound", + "mat.SEPARATOR.105 e", + "the.SEPARATOR.108 g", + "the.SEPARATOR.107 g", + "the.SEPARATOR.106 ErrNotFound", + "the.SEPARATOR.105 d", + "the.SEPARATOR.104 d", + "the.SEPARATOR.104 d", + "the.SEPARATOR.103 a", + "the.SEPARATOR.102 a", + "the.SEPARATOR.101 a", + "the.SEPARATOR.100 ErrNotFound", + "zzz.SEPARATOR.105 ErrNotFound", + }, + }, + + { + description: "triple-0: three level-0 tables", + tables: []testTable{ + { + level: 0, + fileNum: 10, + data: []string{ + "the.SET.101 a", + "cat.SET.102 b", + "on_.SET.103 c", + "the.SET.104 d", + "mat.SET.105 e", + "the.DEL.106 ", + "the.SET.107 g", + }, + }, + { + level: 0, + fileNum: 11, + data: []string{ + "awk.SET.111 w", + "cat.SET.112 x", + "man.SET.113 y", + "sed.SET.114 z", + }, + }, + { + level: 0, + fileNum: 12, + data: []string{ + "the.DEL.121 ", + "cat.DEL.122 ", + "man.DEL.123 ", + "was.SET.124 D", + "not.SET.125 E", + "the.SET.126 F", + "man.SET.127 G", + }, + }, + }, + queries: []string{ + "aaa.SEPARATOR.105 ErrNotFound", + "awk.SEPARATOR.135 w", + "awk.SEPARATOR.125 w", + "awk.SEPARATOR.115 w", + "awk.SEPARATOR.105 ErrNotFound", + "cat.SEPARATOR.135 ErrNotFound", + "cat.SEPARATOR.125 ErrNotFound", + "cat.SEPARATOR.115 x", + "cat.SEPARATOR.105 b", + "man.SEPARATOR.135 G", + "man.SEPARATOR.125 ErrNotFound", + "man.SEPARATOR.115 y", + "man.SEPARATOR.105 ErrNotFound", + "on_.SEPARATOR.135 c", + "on_.SEPARATOR.125 c", + "on_.SEPARATOR.115 c", + "on_.SEPARATOR.105 c", + "the.SEPARATOR.135 F", + 
"the.SEPARATOR.127 F", + "the.SEPARATOR.126 F", + "the.SEPARATOR.125 ErrNotFound", + "the.SEPARATOR.122 ErrNotFound", + "the.SEPARATOR.121 ErrNotFound", + "the.SEPARATOR.120 g", + "the.SEPARATOR.115 g", + "the.SEPARATOR.114 g", + "the.SEPARATOR.111 g", + "the.SEPARATOR.110 g", + "the.SEPARATOR.108 g", + "the.SEPARATOR.107 g", + "the.SEPARATOR.106 ErrNotFound", + "the.SEPARATOR.105 d", + "the.SEPARATOR.104 d", + "the.SEPARATOR.104 d", + "the.SEPARATOR.103 a", + "the.SEPARATOR.102 a", + "the.SEPARATOR.101 a", + "the.SEPARATOR.100 ErrNotFound", + "zzz.SEPARATOR.105 ErrNotFound", + }, + }, + + { + description: "quad-4: four level-4 tables", + tables: []testTable{ + { + level: 4, + fileNum: 11, + data: []string{ + "aardvark.SET.101 a1", + "alpaca__.SET.201 a2", + "anteater.SET.301 a3", + }, + }, + { + level: 4, + fileNum: 22, + data: []string{ + "baboon__.SET.102 b1", + "baboon__.DEL.202 ", + "baboon__.SET.302 b3", + "bear____.SET.402 b4", + "bear____.DEL.502 ", + "buffalo_.SET.602 b6", + }, + }, + { + level: 4, + fileNum: 33, + data: []string{ + "buffalo_.SET.103 B1", + }, + }, + { + level: 4, + fileNum: 44, + data: []string{ + "chipmunk.SET.104 c1", + "chipmunk.SET.204 c2", + }, + }, + }, + queries: []string{ + "a_______.SEPARATOR.999 ErrNotFound", + "aardvark.SEPARATOR.999 a1", + "aardvark.SEPARATOR.102 a1", + "aardvark.SEPARATOR.101 a1", + "aardvark.SEPARATOR.100 ErrNotFound", + "alpaca__.SEPARATOR.999 a2", + "alpaca__.SEPARATOR.200 ErrNotFound", + "anteater.SEPARATOR.999 a3", + "anteater.SEPARATOR.302 a3", + "anteater.SEPARATOR.301 a3", + "anteater.SEPARATOR.300 ErrNotFound", + "anteater.SEPARATOR.000 ErrNotFound", + "b_______.SEPARATOR.999 ErrNotFound", + "baboon__.SEPARATOR.999 b3", + "baboon__.SEPARATOR.302 b3", + "baboon__.SEPARATOR.301 ErrNotFound", + "baboon__.SEPARATOR.202 ErrNotFound", + "baboon__.SEPARATOR.201 b1", + "baboon__.SEPARATOR.102 b1", + "baboon__.SEPARATOR.101 ErrNotFound", + "bear____.SEPARATOR.999 ErrNotFound", + "bear____.SEPARATOR.500 b4", + 
"bear____.SEPARATOR.000 ErrNotFound", + "buffalo_.SEPARATOR.999 b6", + "buffalo_.SEPARATOR.603 b6", + "buffalo_.SEPARATOR.602 b6", + "buffalo_.SEPARATOR.601 B1", + "buffalo_.SEPARATOR.104 B1", + "buffalo_.SEPARATOR.103 B1", + "buffalo_.SEPARATOR.102 ErrNotFound", + "buffalo_.SEPARATOR.000 ErrNotFound", + "c_______.SEPARATOR.999 ErrNotFound", + "chipmunk.SEPARATOR.999 c2", + "chipmunk.SEPARATOR.205 c2", + "chipmunk.SEPARATOR.204 c2", + "chipmunk.SEPARATOR.203 c1", + "chipmunk.SEPARATOR.105 c1", + "chipmunk.SEPARATOR.104 c1", + "chipmunk.SEPARATOR.103 ErrNotFound", + "chipmunk.SEPARATOR.000 ErrNotFound", + "d_______.SEPARATOR.999 ErrNotFound", + }, + }, + + { + description: "complex: many tables at many levels", + tables: []testTable{ + { + level: 0, + fileNum: 50, + data: []string{ + "alfalfa__.SET.501 p1", + "asparagus.SET.502 p2", + "cabbage__.DEL.503 ", + "spinach__.MERGE.504 p3", + }, + }, + { + level: 0, + fileNum: 51, + data: []string{ + "asparagus.SET.511 q1", + "asparagus.SET.512 q2", + "asparagus.SET.513 q3", + "beans____.SET.514 q4", + "broccoli_.SET.515 q5", + "cabbage__.SET.516 q6", + "celery___.SET.517 q7", + "spinach__.MERGE.518 q8", + }, + }, + { + level: 1, + fileNum: 40, + data: []string{ + "alfalfa__.SET.410 r1", + "asparagus.SET.420 r2", + "arugula__.SET.430 r3", + }, + }, + { + level: 1, + fileNum: 41, + data: []string{ + "beans____.SET.411 s1", + "beans____.SET.421 s2", + "bokchoy__.DEL.431 ", + "broccoli_.SET.441 s4", + }, + }, + { + level: 1, + fileNum: 42, + data: []string{ + "cabbage__.SET.412 t1", + "corn_____.DEL.422 ", + "spinach__.MERGE.432 t2", + }, + }, + { + level: 2, + fileNum: 30, + data: []string{ + "alfalfa__.SET.310 u1", + "bokchoy__.SET.320 u2", + "celery___.SET.330 u3", + "corn_____.SET.340 u4", + "spinach__.MERGE.350 u5", + }, + }, + }, + queries: []string{ + "a________.SEPARATOR.999 ErrNotFound", + "alfalfa__.SEPARATOR.520 p1", + "alfalfa__.SEPARATOR.510 p1", + "alfalfa__.SEPARATOR.500 r1", + "alfalfa__.SEPARATOR.400 u1", + 
"alfalfa__.SEPARATOR.300 ErrNotFound", + "asparagus.SEPARATOR.520 q3", + "asparagus.SEPARATOR.510 p2", + "asparagus.SEPARATOR.500 r2", + "asparagus.SEPARATOR.400 ErrNotFound", + "asparagus.SEPARATOR.300 ErrNotFound", + "arugula__.SEPARATOR.520 r3", + "arugula__.SEPARATOR.510 r3", + "arugula__.SEPARATOR.500 r3", + "arugula__.SEPARATOR.400 ErrNotFound", + "arugula__.SEPARATOR.300 ErrNotFound", + "beans____.SEPARATOR.520 q4", + "beans____.SEPARATOR.510 s2", + "beans____.SEPARATOR.500 s2", + "beans____.SEPARATOR.400 ErrNotFound", + "beans____.SEPARATOR.300 ErrNotFound", + "bokchoy__.SEPARATOR.520 ErrNotFound", + "bokchoy__.SEPARATOR.510 ErrNotFound", + "bokchoy__.SEPARATOR.500 ErrNotFound", + "bokchoy__.SEPARATOR.400 u2", + "bokchoy__.SEPARATOR.300 ErrNotFound", + "broccoli_.SEPARATOR.520 q5", + "broccoli_.SEPARATOR.510 s4", + "broccoli_.SEPARATOR.500 s4", + "broccoli_.SEPARATOR.400 ErrNotFound", + "broccoli_.SEPARATOR.300 ErrNotFound", + "cabbage__.SEPARATOR.520 q6", + "cabbage__.SEPARATOR.510 ErrNotFound", + "cabbage__.SEPARATOR.500 t1", + "cabbage__.SEPARATOR.400 ErrNotFound", + "cabbage__.SEPARATOR.300 ErrNotFound", + "celery___.SEPARATOR.520 q7", + "celery___.SEPARATOR.510 u3", + "celery___.SEPARATOR.500 u3", + "celery___.SEPARATOR.400 u3", + "celery___.SEPARATOR.300 ErrNotFound", + "corn_____.SEPARATOR.520 ErrNotFound", + "corn_____.SEPARATOR.510 ErrNotFound", + "corn_____.SEPARATOR.500 ErrNotFound", + "corn_____.SEPARATOR.400 u4", + "corn_____.SEPARATOR.300 ErrNotFound", + "d________.SEPARATOR.999 ErrNotFound", + "spinach__.SEPARATOR.999 u5t2p3q8", + "spinach__.SEPARATOR.518 u5t2p3q8", + "spinach__.SEPARATOR.517 u5t2p3", + "spinach__.SEPARATOR.504 u5t2p3", + "spinach__.SEPARATOR.503 u5t2", + "spinach__.SEPARATOR.432 u5t2", + "spinach__.SEPARATOR.431 u5", + "spinach__.SEPARATOR.350 u5", + "spinach__.SEPARATOR.349 ErrNotFound", + }, + }, + + { + description: "broken invariants 0: non-increasing level 0 sequence numbers", + badOrdering: true, + tables: []testTable{ 
+ { + level: 0, + fileNum: 19, + data: []string{ + "a.SET.101 a", + "b.SET.102 b", + }, + }, + { + level: 0, + fileNum: 20, + data: []string{ + "c.SET.101 c", + }, + }, + }, + }, + + { + description: "broken invariants 1: non-increasing level 0 sequence numbers", + badOrdering: true, + tables: []testTable{ + { + level: 0, + fileNum: 19, + data: []string{ + "a.SET.101 a", + "b.SET.102 b", + }, + }, + { + level: 0, + fileNum: 20, + data: []string{ + "c.SET.100 c", + "d.SET.101 d", + }, + }, + }, + }, + + { + description: "broken invariants 2: matching level 0 sequence numbers, considered acceptable", + badOrdering: false, + tables: []testTable{ + { + level: 0, + fileNum: 19, + data: []string{ + "a.SET.101 a", + }, + }, + { + level: 0, + fileNum: 20, + data: []string{ + "a.SET.101 a", + }, + }, + }, + }, + + { + description: "broken invariants 3: level non-0 overlapping internal key ranges", + badOrdering: true, + tables: []testTable{ + { + level: 5, + fileNum: 11, + data: []string{ + "bat.SET.101 xxx", + "dog.SET.102 xxx", + }, + }, + { + level: 5, + fileNum: 12, + data: []string{ + "cow.SET.103 xxx", + "pig.SET.104 xxx", + }, + }, + }, + }, + } + + cmp := testkeys.Comparer.Compare + for _, tc := range testCases { + desc := tc.description[:strings.Index(tc.description, ":")] + + // m is a map from file numbers to DBs. 
+ m := map[FileNum]*memTable{} + newIter := func( + _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, + ) (internalIterator, keyspan.FragmentIterator, error) { + d, ok := m[file.FileNum] + if !ok { + return nil, nil, errors.New("no such file") + } + return d.newIter(nil), nil, nil + } + + var files [numLevels][]*fileMetadata + for _, tt := range tc.tables { + d := newMemTable(memTableOptions{}) + m[tt.fileNum] = d + + meta := &fileMetadata{ + FileNum: tt.fileNum, + } + meta.InitPhysicalBacking() + for i, datum := range tt.data { + s := strings.Split(datum, " ") + ikey := base.ParseInternalKey(s[0]) + err := d.set(ikey, []byte(s[1])) + if err != nil { + t.Fatalf("desc=%q: memtable Set: %v", desc, err) + } + + meta.ExtendPointKeyBounds(cmp, ikey, ikey) + if i == 0 { + meta.SmallestSeqNum = ikey.SeqNum() + meta.LargestSeqNum = ikey.SeqNum() + } else { + if meta.SmallestSeqNum > ikey.SeqNum() { + meta.SmallestSeqNum = ikey.SeqNum() + } + if meta.LargestSeqNum < ikey.SeqNum() { + meta.LargestSeqNum = ikey.SeqNum() + } + } + } + + files[tt.level] = append(files[tt.level], meta) + } + v := manifest.NewVersion(cmp, base.DefaultFormatter, 10<<20, files) + err := v.CheckOrdering(cmp, base.DefaultFormatter, manifest.AllowSplitUserKeys) + if tc.badOrdering && err == nil { + t.Errorf("desc=%q: want bad ordering, got nil error", desc) + continue + } else if !tc.badOrdering && err != nil { + t.Errorf("desc=%q: bad ordering: %v", desc, err) + continue + } + + get := func(v *version, ikey InternalKey) ([]byte, error) { + var buf struct { + dbi Iterator + get getIter + } + + get := &buf.get + get.comparer = testkeys.Comparer + get.newIters = newIter + get.key = ikey.UserKey + get.l0 = v.L0SublevelFiles + get.version = v + get.snapshot = ikey.SeqNum() + 1 + + i := &buf.dbi + i.comparer = *testkeys.Comparer + i.merge = DefaultMerger.Merge + i.iter = get + + defer i.Close() + if !i.First() { + err := i.Error() + if err != nil { + return nil, err + } + 
return nil, ErrNotFound + } + return i.Value(), nil + } + + for _, query := range tc.queries { + s := strings.Split(query, " ") + ikey := base.ParseInternalKey(s[0]) + value, err := get(v, ikey) + got, want := "", s[1] + if err != nil { + if err != ErrNotFound { + t.Errorf("desc=%q: query=%q: %v", desc, s[0], err) + continue + } + got = "ErrNotFound" + } else { + got = string(value) + } + if got != want { + t.Errorf("desc=%q: query=%q: got %q, want %q", desc, s[0], got, want) + } + } + } +} diff --git a/pebble/go.mod b/pebble/go.mod new file mode 100644 index 0000000..d882642 --- /dev/null +++ b/pebble/go.mod @@ -0,0 +1,49 @@ +module github.com/cockroachdb/pebble + +require ( + github.com/DataDog/zstd v1.4.5 + github.com/HdrHistogram/hdrhistogram-go v1.1.2 + github.com/cespare/xxhash/v2 v2.2.0 + github.com/cockroachdb/datadriven v1.0.3-0.20230413201302-be42291fc80f + github.com/cockroachdb/errors v1.11.1 + github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895 + github.com/cockroachdb/redact v1.1.5 + github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06 + github.com/ghemawat/stream v0.0.0-20171120220530-696b145b53b9 + github.com/golang/snappy v0.0.4 + github.com/guptarohit/asciigraph v0.5.5 + github.com/klauspost/compress v1.15.15 + github.com/kr/pretty v0.3.1 + github.com/pkg/errors v0.9.1 + github.com/pmezard/go-difflib v1.0.0 + github.com/prometheus/client_golang v1.12.0 + github.com/prometheus/client_model v0.2.1-0.20210607210712-147c58e9608a + github.com/spf13/cobra v1.0.0 + github.com/stretchr/testify v1.8.4 + golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df + golang.org/x/perf v0.0.0-20230113213139-801c7ef9e5c5 + golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 + golang.org/x/sys v0.11.0 +) + +require ( + github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect + 
github.com/davecgh/go-spew v1.1.1 // indirect + github.com/getsentry/sentry-go v0.18.0 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/inconshreveable/mousetrap v1.0.0 // indirect + github.com/kr/text v0.2.0 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect + github.com/prometheus/common v0.32.1 // indirect + github.com/prometheus/procfs v0.7.3 // indirect + github.com/rogpeppe/go-internal v1.9.0 // indirect + github.com/spf13/pflag v1.0.5 // indirect + golang.org/x/text v0.7.0 // indirect + google.golang.org/protobuf v1.28.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) + +go 1.20 diff --git a/pebble/go.sum b/pebble/go.sum new file mode 100644 index 0000000..89e7f53 --- /dev/null +++ b/pebble/go.sum @@ -0,0 +1,666 @@ +cloud.google.com/go v0.0.0-20170206221025-ce650573d812/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= +cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= +cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= +cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= +cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= +cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= +cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= +cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= +cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= 
+cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= +cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= +cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= +cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= +cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= +cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= +cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= +cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= +cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= +cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= +cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= +cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= +cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= +cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= +cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= +cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +gioui.org v0.0.0-20210308172011-57750fc8a0a6/go.mod 
h1:RSH6KIUZ0p2xy5zHDxgAM4zumjgTw83q2ge/PI+yyw8= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/DataDog/zstd v1.4.5 h1:EndNeuB0l9syBZhut0wns3gV1hL8zX8LIu6ZiVHWLIQ= +github.com/DataDog/zstd v1.4.5/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo= +github.com/GoogleCloudPlatform/cloudsql-proxy v0.0.0-20190129172621-c8b1d7a94ddf/go.mod h1:aJ4qN3TfrelA6NZ6AXsXRfmEVaYin3EDbSPJrKS8OXo= +github.com/HdrHistogram/hdrhistogram-go v1.1.2 h1:5IcZpTvzydCQeHzK4Ef/D5rrSqwxob0t8PQPMybUNFM= +github.com/HdrHistogram/hdrhistogram-go v1.1.2/go.mod h1:yDgFjdqOqDEKOvasDdhWNXYg9BVp4O+o5f6V/ehm6Oo= +github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= +github.com/aclements/go-gg v0.0.0-20170118225347-6dbb4e4fefb0/go.mod h1:55qNq4vcpkIuHowELi5C8e+1yUHtoLoOUR9QU5j7Tes= +github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 h1:xlwdaKcTNVW4PtpQb8aKA4Pjy0CdJHEqvFbAnvR5m2g= +github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794/go.mod h1:7e+I0LQFUI9AXWxOfsQROs9xPhoJtbsyWcjJqDd4KPY= +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= +github.com/ajstarks/svgo v0.0.0-20210923152817-c3b6e2f0c527/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= +github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod 
h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= +github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= +github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= +github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= +github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= +github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/cockroachdb/datadriven v1.0.3-0.20230413201302-be42291fc80f h1:otljaYPt5hWxV3MUfO5dFPFiOXg9CyG5/kCfayTqsJ4= +github.com/cockroachdb/datadriven 
v1.0.3-0.20230413201302-be42291fc80f/go.mod h1:a9RdTaap04u637JoCzcUoIcDmvwSUtcUFtT/C3kJlTU= +github.com/cockroachdb/errors v1.11.1 h1:xSEW75zKaKCWzR3OfxXUxgrk/NtT4G1MiOv5lWZazG8= +github.com/cockroachdb/errors v1.11.1/go.mod h1:8MUxA3Gi6b25tYlFEBGLf+D8aISL+M4MIpiWMSNRfxw= +github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZeQy818SGhaone5OnYfxFR/+AzdY3sf5aE= +github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs= +github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895 h1:XANOgPYtvELQ/h4IrmPAohXqe2pWA8Bwhejr3VQoZsA= +github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895/go.mod h1:aPd7gM9ov9M8v32Yy5NJrDyOcD8z642dqs+F0CeNXfA= +github.com/cockroachdb/redact v1.1.5 h1:u1PMllDkdFfPWaNGMyLD1+so+aq3uUItthCFqzwPJ30= +github.com/cockroachdb/redact v1.1.5/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= +github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06 h1:zuQyyAKVxetITBuuhv3BI9cMrmStnpT18zmgmTxunpo= +github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06/go.mod h1:7nc4anLGjupUW/PeY5qiNYsdNXj7zopG+eqsS7To5IQ= +github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= +github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= +github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= +github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= +github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= +github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/getsentry/sentry-go v0.18.0 h1:MtBW5H9QgdcJabtZcuJG80BMOwaBpkRDZkxRkNC1sN0= +github.com/getsentry/sentry-go v0.18.0/go.mod h1:Kgon4Mby+FJ7ZWHFUAZgVaIa8sxHtnRJRLTXZr51aKQ= +github.com/ghemawat/stream v0.0.0-20171120220530-696b145b53b9 h1:r5GgOLGbza2wVHRzK7aAj6lWZjfbAwiu/RDCVOKjRyM= +github.com/ghemawat/stream v0.0.0-20171120220530-696b145b53b9/go.mod h1:106OIgooyS7OzLDOpUGgm9fA3bQENb/cFSyyBmMoJDs= +github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= +github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= +github.com/go-fonts/dejavu v0.1.0/go.mod h1:4Wt4I4OU2Nq9asgDCteaAaWZOV24E+0/Pwo0gppep4g= +github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3T0ecnM9pNujks= +github.com/go-fonts/liberation 
v0.1.1/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2HYqyqAO9z7GY= +github.com/go-fonts/liberation v0.2.0/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2HYqyqAO9z7GY= +github.com/go-fonts/stix v0.1.0/go.mod h1:w/c1f0ldAUlJmLBvlbkvVXLAD+tAMqobIIQpmnUIzUY= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= +github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= +github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= +github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U= +github.com/go-latex/latex v0.0.0-20210823091927-c0d11ff05a81/go.mod h1:SX0U8uGpxhq9o2S/CELCSUxEWWAuoCUcVCQWv7G2OCk= +github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= +github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= +github.com/go-pdf/fpdf v0.5.0/go.mod h1:HzcnA+A23uwogo0tp9yU+l3V+KXhiESpt1PMayhOh5M= +github.com/go-sql-driver/mysql v1.4.1/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w= +github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= +github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= 
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf 
v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/gonum/blas v0.0.0-20181208220705-f22b278b28ac/go.mod h1:P32wAyui1PQ58Oce/KYkOqQv8cVw1zAapXOl+dRFGbc= +github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82/go.mod h1:PxC8OnwL11+aosOB5+iEPoV3picfs8tUpkVd0pDo+Kg= +github.com/gonum/internal v0.0.0-20181124074243-f884aa714029/go.mod h1:Pu4dmpkhSyOzRwuXkOgAvijx4o+4YMUJJo9OvPYMkks= +github.com/gonum/lapack v0.0.0-20181123203213-e4cdc5a0bff9/go.mod h1:XA3DeT6rxh2EAE789SSiSJNqxPaC0aE9J8NTOI0Jo/A= +github.com/gonum/matrix v0.0.0-20181209220409-c518dec07be9/go.mod h1:0EXg4mc1CNP0HCqCz+K4ts155PXIlUywf0wqN+GfPZw= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp 
v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= +github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= +github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/renameio v0.1.0/go.mod 
h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/google/safehtml v0.0.2/go.mod h1:L4KWwDsUJdECRAEpZoBn3O64bQaywRscowZjJAzjHnU= +github.com/googleapis/gax-go v0.0.0-20161107002406-da06d194a00e/go.mod h1:SFVmujtThgffbyetf+mdk2eWhX2bMyUtNHzFKcPA9HY= +github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= +github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= +github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= +github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= +github.com/guptarohit/asciigraph v0.5.5 h1:ccFnUF8xYIOUPPY3tmdvRyHqmn1MYI9iv1pLKX+/ZkQ= +github.com/guptarohit/asciigraph v0.5.5/go.mod h1:dYl5wwK4gNsnFf9Zp+l06rFiDZ5YtXM6x7SRWZ3KGag= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= +github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= +github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= +github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= +github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.10/go.mod 
h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= +github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= +github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= +github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= +github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.15.15 h1:EF27CXIuDsYJ6mmvtBRlEuB2UVOqHG1tAXgZ7yIO+lw= +github.com/klauspost/compress v1.15.15/go.mod h1:ZcK2JAFqKOpnBlxcLsJzYfrS9X1akm9fHZNnD9+Vo/4= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/mattn/go-sqlite3 v1.14.5/go.mod h1:WVKg1VTActs4Qso6iwGbiFih2UIHo0ENGwNd0Lj+XmI= +github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= +github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 h1:I0XW9+e1XWDxdcEniV4rQAIOPUGDq67JSCiRCgGCZLI= +github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/oklog/ulid v1.3.1/go.mod 
h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= +github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= +github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY= +github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= +github.com/phpdave11/gofpdi v1.0.13/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= +github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= +github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= +github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= +github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= +github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= +github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= +github.com/prometheus/client_golang v1.12.0 h1:C+UIj/QWtmqY13Arb8kwMt5j34/0Z2iKamrJ+ryC0Gg= +github.com/prometheus/client_golang v1.12.0/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= +github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod 
h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= +github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.2.1-0.20210607210712-147c58e9608a h1:CmF68hwI0XsOQ5UwlBopMi2Ow4Pbg32akc4KIVCOm+Y= +github.com/prometheus/client_model v0.2.1-0.20210607210712-147c58e9608a/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= +github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= +github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= +github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= +github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= +github.com/prometheus/common v0.32.1 h1:hWIdL3N2HoUx3B8j3YN9mWor0qhY/NlEKZEaXxuIRh4= +github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= +github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= +github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= +github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= +github.com/prometheus/procfs v0.7.3 h1:4jVXhlkAyzOScmCkXBTOLRLTz8EeU+eyjrwB/EPq0VU= +github.com/prometheus/procfs v0.7.3/go.mod 
h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= +github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= +github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w= +github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk= +github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= +github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= +github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= +github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= +github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= +github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= +github.com/spf13/cobra v1.0.0 h1:6m/oheQuQ13N9ks4hubMG6BnvwOeaJrqSPLahSnczz8= +github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHNrgE= +github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= +github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= 
+github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/yZzE= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= +github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= +github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= +github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= +github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= +go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opencensus.io v0.22.0/go.mod 
h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= +go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= +go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= +golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= +golang.org/x/exp 
v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE= +golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= +golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= +golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df h1:UA2aFVmmsIlefxMk29Dp2juaUSth8Pyn3Tq5Y5mJGME= +golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20190910094157-69e4b8554b2a/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20200119044424-58c23975cae1/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20210607152325-775e3b0c77b9/go.mod 
h1:023OzeP/+EPmXeapQh35lcL3II3LrY8Ic+EFFKVhULM= +golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d/go.mod h1:023OzeP/+EPmXeapQh35lcL3II3LrY8Ic+EFFKVhULM= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= +golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod 
h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 
+golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/oauth2 v0.0.0-20170207211851-4464e7848382/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 
v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/perf v0.0.0-20230113213139-801c7ef9e5c5 h1:ObuXPmIgI4ZMyQLIz48cJYgSyWdjUXc2SZAdyJMwEAU= +golang.org/x/perf v0.0.0-20230113213139-801c7ef9e5c5/go.mod h1:UBKtEnL8aqnd+0JHqZ+2qoMDwtuy6cYhhKNoHLBiTQc= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 
+golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210304124612-50617c2ba197/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys 
v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190927191325-030b2cf1153e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod 
h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= +golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod 
h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/gonum v0.9.3 h1:DnoIG+QAMaF5NvxnGe/oKsgKcAc6PcUyl8q0VetfQ8s= +gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= +gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= +gonum.org/v1/plot v0.10.0/go.mod h1:JWIHJ7U20drSQb/aDpTetJzfC1KlAPldJLpkSy88dvQ= +google.golang.org/api v0.0.0-20170206182103-3d017632ea10/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0= +google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= +google.golang.org/api 
v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= +google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= +google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod 
h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= +google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto 
v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= +google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/grpc v0.0.0-20170208002647-2a6bf6142e96/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= +google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= 
+google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= +google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= +google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= +google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod 
h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= +gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= +gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod 
h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= +rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= +rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= diff --git a/pebble/ingest.go b/pebble/ingest.go new file mode 100644 index 0000000..149340d --- /dev/null +++ b/pebble/ingest.go @@ -0,0 +1,2410 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "sort" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/private" + "github.com/cockroachdb/pebble/objstorage" + "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/shims/slices" + "github.com/cockroachdb/pebble/sstable" +) + +func sstableKeyCompare(userCmp Compare, a, b InternalKey) int { + c := userCmp(a.UserKey, b.UserKey) + if c != 0 { + return c + } + if a.IsExclusiveSentinel() { + if !b.IsExclusiveSentinel() { + return -1 + } + } else if b.IsExclusiveSentinel() { + return +1 + } + return 0 +} + +// KeyRange encodes a key range in user key space. 
A KeyRange's Start is +// inclusive while its End is exclusive. +type KeyRange struct { + Start, End []byte +} + +// Valid returns true if the KeyRange is defined. +func (k *KeyRange) Valid() bool { + return k.Start != nil && k.End != nil +} + +// Contains returns whether the specified key exists in the KeyRange. +func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool { + v := cmp(key.UserKey, k.End) + return (v < 0 || (v == 0 && key.IsExclusiveSentinel())) && cmp(k.Start, key.UserKey) <= 0 +} + +// OverlapsInternalKeyRange checks if the specified internal key range has an +// overlap with the KeyRange. Note that we aren't checking for full containment +// of smallest-largest within k, rather just that there's some intersection +// between the two ranges. +func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool { + v := cmp(k.Start, largest.UserKey) + return v <= 0 && !(largest.IsExclusiveSentinel() && v == 0) && + cmp(k.End, smallest.UserKey) > 0 +} + +// Overlaps checks if the specified file has an overlap with the KeyRange. +// Note that we aren't checking for full containment of m within k, rather just +// that there's some intersection between m and k's bounds. +func (k *KeyRange) Overlaps(cmp base.Compare, m *fileMetadata) bool { + return k.OverlapsInternalKeyRange(cmp, m.Smallest, m.Largest) +} + +// OverlapsKeyRange checks if this span overlaps with the provided KeyRange. +// Note that we aren't checking for full containment of either span in the other, +// just that there's a key x that is in both key ranges. 
+func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool { + return cmp(k.Start, span.End) < 0 && cmp(k.End, span.Start) > 0 +} + +func ingestValidateKey(opts *Options, key *InternalKey) error { + if key.Kind() == InternalKeyKindInvalid { + return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s", + key.Pretty(opts.Comparer.FormatKey)) + } + if key.SeqNum() != 0 { + return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s", + key.Pretty(opts.Comparer.FormatKey)) + } + return nil +} + +// ingestSynthesizeShared constructs a fileMetadata for one shared sstable owned +// or shared by another node. +func ingestSynthesizeShared( + opts *Options, sm SharedSSTMeta, fileNum base.DiskFileNum, +) (*fileMetadata, error) { + if sm.Size == 0 { + // Disallow 0 file sizes + return nil, errors.New("pebble: cannot ingest shared file with size 0") + } + // Don't load table stats. Doing a round trip to shared storage, one SST + // at a time is not worth it as it slows down ingestion. + meta := &fileMetadata{ + FileNum: fileNum.FileNum(), + CreationTime: time.Now().Unix(), + Virtual: true, + Size: sm.Size, + } + meta.InitProviderBacking(fileNum) + // Set the underlying FileBacking's size to the same size as the virtualized + // view of the sstable. This ensures that we don't over-prioritize this + // sstable for compaction just yet, as we do not have a clear sense of what + // parts of this sstable are referenced by other nodes. + meta.FileBacking.Size = sm.Size + if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil { + // Initialize meta.{HasRangeKeys,Smallest,Largest}, etc. + // + // NB: We create new internal keys and pass them into ExternalRangeKeyBounds + // so that we can sub a zero sequence number into the bounds. We can set + // the sequence number to anything here; it'll be reset in ingestUpdateSeqNum + // anyway. 
However we do need to use the same sequence number across all + // bound keys at this step so that we end up with bounds that are consistent + // across point/range keys. + smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, sm.SmallestRangeKey.Kind()) + largestRangeKey := base.MakeExclusiveSentinelKey(sm.LargestRangeKey.Kind(), sm.LargestRangeKey.UserKey) + meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey) + } + if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil { + // Initialize meta.{HasPointKeys,Smallest,Largest}, etc. + // + // See point above in the ExtendRangeKeyBounds call on why we use a zero + // sequence number here. + smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, sm.SmallestPointKey.Kind()) + largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, sm.LargestPointKey.Kind()) + if sm.LargestPointKey.IsExclusiveSentinel() { + largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey) + } + meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey) + } + if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { + return nil, err + } + return meta, nil +} + +// ingestLoad1External loads the fileMetadata for one external sstable. +// Sequence number and target level calculation happens during prepare/apply. +func ingestLoad1External( + opts *Options, + e ExternalFile, + fileNum base.DiskFileNum, + objprovider objstorage.Provider, + jobID int, +) (*fileMetadata, error) { + if e.Size == 0 { + // Disallow 0 file sizes + return nil, errors.New("pebble: cannot ingest external file with size 0") + } + if !e.HasRangeKey && !e.HasPointKey { + return nil, errors.New("pebble: cannot ingest external file with no point or range keys") + } + // Don't load table stats. Doing a round trip to shared storage, one SST + // at a time is not worth it as it slows down ingestion. 
+ meta := &fileMetadata{} + meta.FileNum = fileNum.FileNum() + meta.CreationTime = time.Now().Unix() + meta.Virtual = true + meta.Size = e.Size + meta.InitProviderBacking(fileNum) + + // Try to resolve a reference to the external file. + backing, err := objprovider.CreateExternalObjectBacking(e.Locator, e.ObjName) + if err != nil { + return nil, err + } + metas, err := objprovider.AttachRemoteObjects([]objstorage.RemoteObjectToAttach{{ + FileNum: fileNum, + FileType: fileTypeTable, + Backing: backing, + }}) + if err != nil { + return nil, err + } + if opts.EventListener.TableCreated != nil { + opts.EventListener.TableCreated(TableCreateInfo{ + JobID: jobID, + Reason: "ingesting", + Path: objprovider.Path(metas[0]), + FileNum: fileNum.FileNum(), + }) + } + // In the name of keeping this ingestion as fast as possible, we avoid + // *all* existence checks and synthesize a file metadata with smallest/largest + // keys that overlap whatever the passed-in span was. + smallestCopy := make([]byte, len(e.SmallestUserKey)) + copy(smallestCopy, e.SmallestUserKey) + largestCopy := make([]byte, len(e.LargestUserKey)) + copy(largestCopy, e.LargestUserKey) + if e.HasPointKey { + meta.ExtendPointKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax), + base.MakeRangeDeleteSentinelKey(largestCopy)) + } + if e.HasRangeKey { + meta.ExtendRangeKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeySet), + base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyDelete, largestCopy)) + } + + // Set the underlying FileBacking's size to the same size as the virtualized + // view of the sstable. This ensures that we don't over-prioritize this + // sstable for compaction just yet, as we do not have a clear sense of + // what parts of this sstable are referenced by other nodes. 
+ meta.FileBacking.Size = e.Size + + if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { + return nil, err + } + return meta, nil +} + +// ingestLoad1 creates the FileMetadata for one file. This file will be owned +// by this store. +func ingestLoad1( + opts *Options, + fmv FormatMajorVersion, + readable objstorage.Readable, + cacheID uint64, + fileNum base.DiskFileNum, +) (*fileMetadata, error) { + cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption) + r, err := sstable.NewReader(readable, opts.MakeReaderOptions(), cacheOpts) + if err != nil { + return nil, err + } + defer r.Close() + + // Avoid ingesting tables with format versions this DB doesn't support. + tf, err := r.TableFormat() + if err != nil { + return nil, err + } + if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() { + return nil, errors.Newf( + "pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)", + tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(), + ) + } + + meta := &fileMetadata{} + meta.FileNum = fileNum.FileNum() + meta.Size = uint64(readable.Size()) + meta.CreationTime = time.Now().Unix() + meta.InitPhysicalBacking() + + // Avoid loading into the table cache for collecting stats if we + // don't need to. If there are no range deletions, we have all the + // information to compute the stats here. + // + // This is helpful in tests for avoiding awkwardness around deletion of + // ingested files from MemFS. MemFS implements the Windows semantics of + // disallowing removal of an open file. Under MemFS, if we don't populate + // meta.Stats here, the file will be loaded into the table cache for + // calculating stats before we can remove the original link. 
+ maybeSetStatsFromProperties(meta.PhysicalMeta(), &r.Properties) + + { + iter, err := r.NewIter(nil /* lower */, nil /* upper */) + if err != nil { + return nil, err + } + defer iter.Close() + var smallest InternalKey + if key, _ := iter.First(); key != nil { + if err := ingestValidateKey(opts, key); err != nil { + return nil, err + } + smallest = (*key).Clone() + } + if err := iter.Error(); err != nil { + return nil, err + } + if key, _ := iter.Last(); key != nil { + if err := ingestValidateKey(opts, key); err != nil { + return nil, err + } + meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone()) + } + if err := iter.Error(); err != nil { + return nil, err + } + } + + iter, err := r.NewRawRangeDelIter() + if err != nil { + return nil, err + } + if iter != nil { + defer iter.Close() + var smallest InternalKey + if s := iter.First(); s != nil { + key := s.SmallestKey() + if err := ingestValidateKey(opts, &key); err != nil { + return nil, err + } + smallest = key.Clone() + } + if err := iter.Error(); err != nil { + return nil, err + } + if s := iter.Last(); s != nil { + k := s.SmallestKey() + if err := ingestValidateKey(opts, &k); err != nil { + return nil, err + } + largest := s.LargestKey().Clone() + meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest) + } + } + + // Update the range-key bounds for the table. 
+ { + iter, err := r.NewRawRangeKeyIter() + if err != nil { + return nil, err + } + if iter != nil { + defer iter.Close() + var smallest InternalKey + if s := iter.First(); s != nil { + key := s.SmallestKey() + if err := ingestValidateKey(opts, &key); err != nil { + return nil, err + } + smallest = key.Clone() + } + if err := iter.Error(); err != nil { + return nil, err + } + if s := iter.Last(); s != nil { + k := s.SmallestKey() + if err := ingestValidateKey(opts, &k); err != nil { + return nil, err + } + // As range keys are fragmented, the end key of the last range key in + // the table provides the upper bound for the table. + largest := s.LargestKey().Clone() + meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest) + } + if err := iter.Error(); err != nil { + return nil, err + } + } + } + + if !meta.HasPointKeys && !meta.HasRangeKeys { + return nil, nil + } + + // Sanity check that the various bounds on the file were set consistently. + if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { + return nil, err + } + + return meta, nil +} + +type ingestLoadResult struct { + localMeta, sharedMeta []*fileMetadata + externalMeta []*fileMetadata + localPaths []string + sharedLevels []uint8 + fileCount int +} + +func ingestLoad( + opts *Options, + fmv FormatMajorVersion, + paths []string, + shared []SharedSSTMeta, + external []ExternalFile, + cacheID uint64, + pending []base.DiskFileNum, + objProvider objstorage.Provider, + jobID int, +) (ingestLoadResult, error) { + meta := make([]*fileMetadata, 0, len(paths)) + newPaths := make([]string, 0, len(paths)) + for i := range paths { + f, err := opts.FS.Open(paths[i]) + if err != nil { + return ingestLoadResult{}, err + } + + readable, err := sstable.NewSimpleReadable(f) + if err != nil { + return ingestLoadResult{}, err + } + m, err := ingestLoad1(opts, fmv, readable, cacheID, pending[i]) + if err != nil { + return ingestLoadResult{}, err + } + if m != nil { + meta = 
append(meta, m) + newPaths = append(newPaths, paths[i]) + } + } + if len(shared) == 0 && len(external) == 0 { + return ingestLoadResult{localMeta: meta, localPaths: newPaths, fileCount: len(meta)}, nil + } + + // Sort the shared files according to level. + sort.Sort(sharedByLevel(shared)) + + sharedMeta := make([]*fileMetadata, 0, len(shared)) + levels := make([]uint8, 0, len(shared)) + for i := range shared { + m, err := ingestSynthesizeShared(opts, shared[i], pending[len(paths)+i]) + if err != nil { + return ingestLoadResult{}, err + } + if shared[i].Level < sharedLevelsStart { + return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart") + } + sharedMeta = append(sharedMeta, m) + levels = append(levels, shared[i].Level) + } + externalMeta := make([]*fileMetadata, 0, len(external)) + for i := range external { + m, err := ingestLoad1External(opts, external[i], pending[len(paths)+len(shared)+i], objProvider, jobID) + if err != nil { + return ingestLoadResult{}, err + } + externalMeta = append(externalMeta, m) + } + result := ingestLoadResult{ + localMeta: meta, + sharedMeta: sharedMeta, + externalMeta: externalMeta, + localPaths: newPaths, + sharedLevels: levels, + fileCount: len(meta) + len(sharedMeta) + len(externalMeta), + } + return result, nil +} + +// Struct for sorting metadatas by smallest user keys, while ensuring the +// matching path also gets swapped to the same index. For use in +// ingestSortAndVerify. 
+type metaAndPaths struct { + meta []*fileMetadata + paths []string + cmp Compare +} + +func (m metaAndPaths) Len() int { + return len(m.meta) +} + +func (m metaAndPaths) Less(i, j int) bool { + return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0 +} + +func (m metaAndPaths) Swap(i, j int) { + m.meta[i], m.meta[j] = m.meta[j], m.meta[i] + if m.paths != nil { + m.paths[i], m.paths[j] = m.paths[j], m.paths[i] + } +} + +func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error { + // Verify that all the shared files (i.e. files in sharedMeta) + // fit within the exciseSpan. + for i := range lr.sharedMeta { + f := lr.sharedMeta[i] + if !exciseSpan.Contains(cmp, f.Smallest) || !exciseSpan.Contains(cmp, f.Largest) { + return errors.AssertionFailedf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String()) + } + } + if len(lr.externalMeta) > 0 { + if len(lr.localMeta) > 0 || len(lr.sharedMeta) > 0 { + // Currently we only support external ingests on their own. If external + // files are present alongside local/shared files, return an error. 
+ return errors.AssertionFailedf("pebble: external files cannot be ingested atomically alongside other types of files") + } + sort.Sort(&metaAndPaths{ + meta: lr.externalMeta, + cmp: cmp, + }) + for i := 1; i < len(lr.externalMeta); i++ { + if sstableKeyCompare(cmp, lr.externalMeta[i-1].Largest, lr.externalMeta[i].Smallest) >= 0 { + return errors.AssertionFailedf("pebble: external sstables have overlapping ranges") + } + } + return nil + } + if len(lr.localMeta) <= 1 || len(lr.localPaths) <= 1 { + return nil + } + + sort.Sort(&metaAndPaths{ + meta: lr.localMeta, + paths: lr.localPaths, + cmp: cmp, + }) + + for i := 1; i < len(lr.localPaths); i++ { + if sstableKeyCompare(cmp, lr.localMeta[i-1].Largest, lr.localMeta[i].Smallest) >= 0 { + return errors.AssertionFailedf("pebble: local ingestion sstables have overlapping ranges") + } + } + if len(lr.sharedMeta) == 0 { + return nil + } + filesInLevel := make([]*fileMetadata, 0, len(lr.sharedMeta)) + for l := sharedLevelsStart; l < numLevels; l++ { + filesInLevel = filesInLevel[:0] + for i := range lr.sharedMeta { + if lr.sharedLevels[i] == uint8(l) { + filesInLevel = append(filesInLevel, lr.sharedMeta[i]) + } + } + slices.SortFunc(filesInLevel, func(a, b *fileMetadata) int { + return cmp(a.Smallest.UserKey, b.Smallest.UserKey) + }) + for i := 1; i < len(filesInLevel); i++ { + if sstableKeyCompare(cmp, filesInLevel[i-1].Largest, filesInLevel[i].Smallest) >= 0 { + return errors.AssertionFailedf("pebble: external shared sstables have overlapping ranges") + } + } + } + return nil +} + +func ingestCleanup(objProvider objstorage.Provider, meta []*fileMetadata) error { + var firstErr error + for i := range meta { + if err := objProvider.Remove(fileTypeTable, meta[i].FileBacking.DiskFileNum); err != nil { + firstErr = firstError(firstErr, err) + } + } + return firstErr +} + +// ingestLink creates new objects which are backed by either hardlinks to or +// copies of the ingested files. 
// It also attaches shared objects to the provider.
func ingestLink(
	jobID int,
	opts *Options,
	objProvider objstorage.Provider,
	lr ingestLoadResult,
	shared []SharedSSTMeta,
) error {
	// Link (or, if linking fails, copy) each local file into the provider.
	// On failure, undo the objects created so far before returning.
	for i := range lr.localPaths {
		objMeta, err := objProvider.LinkOrCopyFromLocal(
			context.TODO(), opts.FS, lr.localPaths[i], fileTypeTable, lr.localMeta[i].FileBacking.DiskFileNum,
			objstorage.CreateOptions{PreferSharedStorage: true},
		)
		if err != nil {
			if err2 := ingestCleanup(objProvider, lr.localMeta[:i]); err2 != nil {
				opts.Logger.Errorf("ingest cleanup failed: %v", err2)
			}
			return err
		}
		if opts.EventListener.TableCreated != nil {
			opts.EventListener.TableCreated(TableCreateInfo{
				JobID:   jobID,
				Reason:  "ingesting",
				Path:    objProvider.Path(objMeta),
				FileNum: lr.localMeta[i].FileNum,
			})
		}
	}
	// Attach the shared objects to the provider. NB: shared and lr.sharedMeta
	// are parallel slices (both produced from the same input in ingestLoad).
	sharedObjs := make([]objstorage.RemoteObjectToAttach, 0, len(shared))
	for i := range shared {
		backing, err := shared[i].Backing.Get()
		if err != nil {
			return err
		}
		sharedObjs = append(sharedObjs, objstorage.RemoteObjectToAttach{
			FileNum:  lr.sharedMeta[i].FileBacking.DiskFileNum,
			FileType: fileTypeTable,
			Backing:  backing,
		})
	}
	sharedObjMetas, err := objProvider.AttachRemoteObjects(sharedObjs)
	if err != nil {
		return err
	}
	for i := range sharedObjMetas {
		// One corner case around file sizes we need to be mindful of, is that
		// if one of the shareObjs was initially created by us (and has boomeranged
		// back from another node), we'll need to update the FileBacking's size
		// to be the true underlying size. Otherwise, we could hit errors when we
		// open the db again after a crash/restart (see checkConsistency in open.go),
		// plus it more accurately allows us to prioritize compactions of files
		// that were originally created by us.
		if sharedObjMetas[i].IsShared() && !objProvider.IsSharedForeign(sharedObjMetas[i]) {
			size, err := objProvider.Size(sharedObjMetas[i])
			if err != nil {
				return err
			}
			lr.sharedMeta[i].FileBacking.Size = uint64(size)
		}
		if opts.EventListener.TableCreated != nil {
			opts.EventListener.TableCreated(TableCreateInfo{
				JobID:   jobID,
				Reason:  "ingesting",
				Path:    objProvider.Path(sharedObjMetas[i]),
				FileNum: lr.sharedMeta[i].FileNum,
			})
		}
	}
	// We do not need to do anything about lr.externalMetas. Those were already
	// linked in ingestLoad.

	return nil
}

// ingestMemtableOverlaps reports whether any of keyRanges overlaps the point
// keys, range deletions, or range keys of the given flushable. It is
// conservative: if any of the iterators errors out, overlap is assumed.
func ingestMemtableOverlaps(cmp Compare, mem flushable, keyRanges []internalKeyRange) bool {
	iter := mem.newIter(nil)
	rangeDelIter := mem.newRangeDelIter(nil)
	rkeyIter := mem.newRangeKeyIter(nil)

	closeIters := func() error {
		err := iter.Close()
		if rangeDelIter != nil {
			err = firstError(err, rangeDelIter.Close())
		}
		if rkeyIter != nil {
			err = firstError(err, rkeyIter.Close())
		}
		return err
	}

	for _, kr := range keyRanges {
		if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, cmp) {
			// Overlap found; the result of closing the iterators doesn't
			// change the answer, so the error is deliberately ignored.
			closeIters()
			return true
		}
	}

	// Assume overlap if any iterator errored out.
	return closeIters() != nil
}

// ingestUpdateSeqNum assigns sequence numbers (starting at seqNum, one per
// file) to the bounds of every file in loadResult, in the order: shared files
// (reversed, i.e. deepest level first), then local files, then external
// files. The sstables themselves are not rewritten; the new bounds are
// persisted when the metadata is written to the manifest.
func ingestUpdateSeqNum(
	cmp Compare, format base.FormatKey, seqNum uint64, loadResult ingestLoadResult,
) error {
	setSeqFn := func(k base.InternalKey) base.InternalKey {
		return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
	}
	// updateMetadata stamps the current seqNum onto m's bounds, validates the
	// result, and then advances seqNum (captured by reference) for the next
	// file.
	updateMetadata := func(m *fileMetadata) error {
		// NB: we set the fields directly here, rather than via their Extend*
		// methods, as we are updating sequence numbers.
		if m.HasPointKeys {
			m.SmallestPointKey = setSeqFn(m.SmallestPointKey)
		}
		if m.HasRangeKeys {
			m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey)
		}
		m.Smallest = setSeqFn(m.Smallest)
		// Only update the seqnum for the largest key if that key is not an
		// "exclusive sentinel" (i.e. a range deletion sentinel or a range key
		// boundary), as doing so effectively drops the exclusive sentinel (by
		// lowering the seqnum from the max value), and extends the bounds of the
		// table.
		// NB: as the largest range key is always an exclusive sentinel, it is never
		// updated.
		if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() {
			m.LargestPointKey = setSeqFn(m.LargestPointKey)
		}
		if !m.Largest.IsExclusiveSentinel() {
			m.Largest = setSeqFn(m.Largest)
		}
		// Setting smallestSeqNum == largestSeqNum triggers the setting of
		// Properties.GlobalSeqNum when an sstable is loaded.
		m.SmallestSeqNum = seqNum
		m.LargestSeqNum = seqNum
		// Ensure the new bounds are consistent.
		if err := m.Validate(cmp, format); err != nil {
			return err
		}
		seqNum++
		return nil
	}

	// Shared sstables are required to be sorted by level ascending. We then
	// iterate the shared sstables in reverse, assigning the lower sequence
	// numbers to the shared sstables that will be ingested into the lower
	// (larger numbered) levels first. This ensures sequence number shadowing is
	// correct.
	for i := len(loadResult.sharedMeta) - 1; i >= 0; i-- {
		if i-1 >= 0 && loadResult.sharedLevels[i-1] > loadResult.sharedLevels[i] {
			panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.sharedMeta[i-1], loadResult.sharedMeta[i]))
		}
		if err := updateMetadata(loadResult.sharedMeta[i]); err != nil {
			return err
		}
	}
	for i := range loadResult.localMeta {
		if err := updateMetadata(loadResult.localMeta[i]); err != nil {
			return err
		}
	}
	for i := range loadResult.externalMeta {
		if err := updateMetadata(loadResult.externalMeta[i]); err != nil {
			return err
		}
	}
	return nil
}

// Denotes an internal key range. Smallest and largest are both inclusive.
type internalKeyRange struct {
	smallest, largest InternalKey
}

// overlapWithIterator reports whether keyRange overlaps any point key, range
// deletion, or range key surfaced by the given iterators. It is conservative:
// if an iterator errors out, overlap is assumed.
func overlapWithIterator(
	iter internalIterator,
	rangeDelIter *keyspan.FragmentIterator,
	rkeyIter keyspan.FragmentIterator,
	keyRange internalKeyRange,
	cmp Compare,
) bool {
	// Check overlap with point operations.
	//
	// When using levelIter, it seeks to the SST whose boundaries
	// contain keyRange.smallest.UserKey(S).
	// It then tries to find a point in that SST that is >= S.
	// If there's no such point it means the SST ends in a tombstone in which case
	// levelIter.SeekGE generates a boundary range del sentinel.
	// The comparison of this boundary with keyRange.largest(L) below
	// is subtle but maintains correctness.
	// 1) boundary < L,
	//    since boundary is also > S (initial seek),
	//    whatever the boundary's start key may be, we're always overlapping.
	// 2) boundary > L,
	//    overlap with boundary cannot be determined since we don't know boundary's start key.
	//    We require checking for overlap with rangeDelIter.
	// 3) boundary == L and L is not sentinel,
	//    means boundary < L and hence is similar to 1).
	// 4) boundary == L and L is sentinel,
	//    we'll always overlap since for any values of i,j ranges [i, k) and [j, k) always overlap.
	key, _ := iter.SeekGE(keyRange.smallest.UserKey, base.SeekGEFlagsNone)
	if key != nil {
		c := sstableKeyCompare(cmp, *key, keyRange.largest)
		if c <= 0 {
			return true
		}
	}
	// Assume overlap if iterator errored.
	if err := iter.Error(); err != nil {
		return true
	}

	// computeOverlapWithSpans reports whether any non-empty span surfaced by
	// rIter overlaps keyRange. Span end keys are exclusive.
	computeOverlapWithSpans := func(rIter keyspan.FragmentIterator) bool {
		// NB: The spans surfaced by the fragment iterator are non-overlapping.
		span := rIter.SeekLT(keyRange.smallest.UserKey)
		if span == nil {
			span = rIter.Next()
		}
		for ; span != nil; span = rIter.Next() {
			if span.Empty() {
				continue
			}
			key := span.SmallestKey()
			c := sstableKeyCompare(cmp, key, keyRange.largest)
			if c > 0 {
				// The start of the span is after the largest key in the
				// ingested table.
				return false
			}
			if cmp(span.End, keyRange.smallest.UserKey) > 0 {
				// The end of the span is greater than the smallest in the
				// table. Note that the span end key is exclusive, thus ">0"
				// instead of ">=0".
				return true
			}
		}
		// Assume overlap if iterator errored.
		if err := rIter.Error(); err != nil {
			return true
		}
		return false
	}

	// rkeyIter is either a range key level iter, or a range key iterator
	// over a single file.
	if rkeyIter != nil {
		if computeOverlapWithSpans(rkeyIter) {
			return true
		}
	}

	// Check overlap with range deletions.
	if rangeDelIter == nil || *rangeDelIter == nil {
		return false
	}
	return computeOverlapWithSpans(*rangeDelIter)
}

// ingestTargetLevel returns the target level for a file being ingested.
// If suggestSplit is true, it accounts for ingest-time splitting as part of
// its target level calculation, and if a split candidate is found, that file
// is returned as the splitFile.
func ingestTargetLevel(
	newIters tableNewIters,
	newRangeKeyIter keyspan.TableNewSpanIter,
	iterOps IterOptions,
	comparer *Comparer,
	v *version,
	baseLevel int,
	compactions map[*compaction]struct{},
	meta *fileMetadata,
	suggestSplit bool,
) (targetLevel int, splitFile *fileMetadata, err error) {
	// Find the lowest level which does not have any files which overlap meta. We
	// search from L0 to L6 looking for whether there are any files in the level
	// which overlap meta. We want the "lowest" level (where lower means
	// increasing level number) in order to reduce write amplification.
	//
	// There are 2 kinds of overlap we need to check for: file boundary overlap
	// and data overlap. Data overlap implies file boundary overlap. Note that it
	// is always possible to ingest into L0.
	//
	// To place meta at level i where i > 0:
	// - there must not be any data overlap with levels <= i, since that will
	//   violate the sequence number invariant.
	// - no file boundary overlap with level i, since that will violate the
	//   invariant that files do not overlap in levels i > 0.
	// - if there is only a file overlap at a given level, and no data overlap,
	//   we can still slot a file at that level. We return the fileMetadata with
	//   which we have file boundary overlap (must be only one file, as sstable
	//   bounds are usually tight on user keys) and the caller is expected to split
	//   that sstable into two virtual sstables, allowing this file to go into that
	//   level. Note that if we have file boundary overlap with two files, which
	//   should only happen on rare occasions, we treat it as data overlap and
	//   don't use this optimization.
	//
	// The file boundary overlap check is simpler to conceptualize. Consider the
	// following example, in which the ingested file lies completely before or
	// after the file being considered.
	//
	//   |--|           |--|  ingested file: [a,b] or [f,g]
	//         |-----|        existing file: [c,e]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In both cases the ingested file can move to considering the next level.
	//
	// File boundary overlap does not necessarily imply data overlap. The check
	// for data overlap is a little more nuanced. Consider the following examples:
	//
	//  1. No data overlap:
	//
	//          |-|         |--|    ingested file: [cc-d] or [ee-ff]
	//  |*--*--*----*------*|       existing file: [a-g], points: [a, b, c, dd, g]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In this case the ingested files can "fall through" this level. The checks
	// continue at the next level.
	//
	//  2. Data overlap:
	//
	//            |--|        ingested file: [d-e]
	//  |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g]
	//  _____________________
	//   a  b  c  d  e  f  g
	//
	// In this case the file cannot be ingested into this level as the point 'dd'
	// is in the way.
	//
	// It is worth noting that the check for data overlap is only approximate. In
	// the previous example, the ingested table [d-e] could contain only the
	// points 'd' and 'e', in which case the table would be eligible for
	// considering lower levels. However, such a fine-grained check would need to
	// be exhaustive (comparing points and ranges in both the ingested existing
	// tables) and such a check is prohibitively expensive. Thus Pebble treats any
	// existing point that falls within the ingested table bounds as being "data
	// overlap".

	// This assertion implicitly checks that we have the current version of
	// the metadata.
	if v.L0Sublevels == nil {
		return 0, nil, errors.AssertionFailedf("could not read L0 sublevels")
	}
	iterOps.CategoryAndQoS = sstable.CategoryAndQoS{
		Category: "pebble-ingest",
		QoSLevel: sstable.LatencySensitiveQoSLevel,
	}
	// Check for overlap over the keys of L0 by iterating over the sublevels.
	// Any overlap in L0 (data or boundary) pins the ingest to L0.
	for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ {
		iter := newLevelIter(context.Background(),
			iterOps, comparer, newIters, v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), internalIterOpts{})

		var rangeDelIter keyspan.FragmentIterator
		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
		// sets it up for the target file.
		iter.initRangeDel(&rangeDelIter)

		levelIter := keyspan.LevelIter{}
		levelIter.Init(
			keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
			v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), manifest.KeyTypeRange,
		)

		kr := internalKeyRange{
			smallest: meta.Smallest,
			largest:  meta.Largest,
		}
		overlap := overlapWithIterator(iter, &rangeDelIter, &levelIter, kr, comparer.Compare)
		err := iter.Close() // Closes range del iter as well.
		err = firstError(err, levelIter.Close())
		if err != nil {
			return 0, nil, err
		}
		if overlap {
			// NB: targetLevel is still its zero value (0) here, i.e. L0.
			return targetLevel, nil, nil
		}
	}

	level := baseLevel
	for ; level < numLevels; level++ {
		levelIter := newLevelIter(context.Background(),
			iterOps, comparer, newIters, v.Levels[level].Iter(), manifest.Level(level), internalIterOpts{})
		var rangeDelIter keyspan.FragmentIterator
		// Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE
		// sets it up for the target file.
		levelIter.initRangeDel(&rangeDelIter)

		rkeyLevelIter := &keyspan.LevelIter{}
		rkeyLevelIter.Init(
			keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter,
			v.Levels[level].Iter(), manifest.Level(level), manifest.KeyTypeRange,
		)

		kr := internalKeyRange{
			smallest: meta.Smallest,
			largest:  meta.Largest,
		}
		overlap := overlapWithIterator(levelIter, &rangeDelIter, rkeyLevelIter, kr, comparer.Compare)
		err := levelIter.Close() // Closes range del iter as well.
		err = firstError(err, rkeyLevelIter.Close())
		if err != nil {
			return 0, nil, err
		}
		if overlap {
			// Data overlap at this level: the best target found so far stands.
			return targetLevel, splitFile, nil
		}

		// Check boundary overlap.
		var candidateSplitFile *fileMetadata
		boundaryOverlaps := v.Overlaps(level, comparer.Compare, meta.Smallest.UserKey,
			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
		if !boundaryOverlaps.Empty() {
			// We are already guaranteed to not have any data overlaps with files
			// in boundaryOverlaps, otherwise we'd have returned in the above if
			// statements. Use this, plus boundaryOverlaps.Len() == 1 to detect for
			// the case where we can slot this file into the current level despite
			// a boundary overlap, by splitting one existing file into two virtual
			// sstables.
			if suggestSplit && boundaryOverlaps.Len() == 1 {
				iter := boundaryOverlaps.Iter()
				candidateSplitFile = iter.First()
			} else {
				// We either don't want to suggest ingest-time splits (i.e.
				// !suggestSplit), or we boundary-overlapped with more than one file.
				continue
			}
		}

		// Check boundary overlap with any ongoing compactions. We consider an
		// overlapping compaction that's writing files to an output level as
		// equivalent to boundary overlap with files in that output level.
		//
		// We cannot check for data overlap with the new SSTs compaction will produce
		// since compaction hasn't been done yet. However, there's no need to check
		// since all keys in them will be from levels in [c.startLevel,
		// c.outputLevel], and all those levels have already had their data overlap
		// tested negative (else we'd have returned earlier).
		//
		// An alternative approach would be to cancel these compactions and proceed
		// with an ingest-time split on this level if necessary. However, compaction
		// cancellation can result in significant wasted effort and is best avoided
		// unless necessary.
		overlaps := false
		for c := range compactions {
			if c.outputLevel == nil || level != c.outputLevel.level {
				continue
			}
			if comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 &&
				comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 {
				overlaps = true
				break
			}
		}
		if !overlaps {
			targetLevel = level
			splitFile = candidateSplitFile
		}
	}
	return targetLevel, splitFile, nil
}

// Ingest ingests a set of sstables into the DB. Ingestion of the files is
// atomic and semantically equivalent to creating a single batch containing all
// of the mutations in the sstables. Ingestion may require the memtable to be
// flushed. The ingested sstable files are moved into the DB and must reside on
// the same filesystem as the DB. Sstables can be created for ingestion using
// sstable.Writer. On success, Ingest removes the input paths.
//
// Two types of sstables are accepted for ingestion(s): one is sstables present
// in the instance's vfs.FS and can be referenced locally. The other is sstables
// present in remote.Storage, referred to as shared or foreign sstables. These
// shared sstables can be linked through objstorageprovider.Provider, and do not
// need to already be present on the local vfs.FS. Foreign sstables must all fit
// in an excise span, and are destined for a level specified in SharedSSTMeta.
//
// All sstables *must* be Sync()'d by the caller after all bytes are written
// and before its file handle is closed; failure to do so could violate
// durability or lead to corrupted on-disk state. This method cannot, in a
// platform-and-FS-agnostic way, ensure that all sstables in the input are
// properly synced to disk.
// Opening new file handles and Sync()-ing them
// does not always guarantee durability; see the discussion here on that:
// https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
//
// Ingestion loads each sstable into the lowest level of the LSM which it
// doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable,
// ingestion forces the memtable to flush, and then waits for the flush to
// occur. In some cases, such as with no foreign sstables and no excise span,
// ingestion that gets blocked on a memtable can join the flushable queue and
// finish even before the memtable has been flushed.
//
// The steps for ingestion are:
//
//  1. Allocate file numbers for every sstable being ingested.
//  2. Load the metadata for all sstables being ingested.
//  3. Sort the sstables by smallest key, verifying non overlap (for local
//     sstables).
//  4. Hard link (or copy) the local sstables into the DB directory.
//  5. Allocate a sequence number to use for all of the entries in the
//     local sstables. This is the step where overlap with memtables is
//     determined. If there is overlap, we remember the most recent memtable
//     that overlaps.
//  6. Update the sequence number in the ingested local sstables. (Remote
//     sstables get fixed sequence numbers that were determined at load time.)
//  7. Wait for the most recent memtable that overlaps to flush (if any).
//  8. Add the ingested sstables to the version (DB.ingestApply).
//     8.1. If an excise span was specified, figure out what sstables in the
//     current version overlap with the excise span, and create new virtual
//     sstables out of those sstables that exclude the excised span (DB.excise).
//  9. Publish the ingestion sequence number.
//
// Note that if the mutable memtable overlaps with ingestion, a flush of the
// memtable is forced equivalent to DB.Flush. Additionally, subsequent
// mutations that get sequence numbers larger than the ingestion sequence
// number get queued up behind the ingestion waiting for it to complete. This
// can produce a noticeable hiccup in performance. See
// https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix
// this hiccup.
func (d *DB) Ingest(paths []string) error {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}
	if d.opts.ReadOnly {
		return ErrReadOnly
	}
	_, err := d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
	return err
}

// IngestOperationStats provides some information about where in the LSM the
// bytes were ingested.
type IngestOperationStats struct {
	// Bytes is the total bytes in the ingested sstables.
	Bytes uint64
	// ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
	// into L0. This value is approximate when flushable ingests are active and
	// an ingest overlaps an entry in the flushable queue. Currently, this
	// approximation is very rough, only including tables that overlapped the
	// memtable. This estimate may be improved with #2112.
	ApproxIngestedIntoL0Bytes uint64
	// MemtableOverlappingFiles is the count of ingested sstables
	// that overlapped keys in the memtables.
	MemtableOverlappingFiles int
}

// ExternalFile are external sstables that can be referenced through
// objprovider and ingested as remote files that will not be refcounted or
// cleaned up. For use with online restore. Note that the underlying sstable
// could contain keys outside the [Smallest,Largest) bounds; however Pebble
// is expected to only read the keys within those bounds.
type ExternalFile struct {
	// Locator is the shared.Locator that can be used with objProvider to
	// resolve a reference to this external sstable.
	Locator remote.Locator
	// ObjName is the unique name of this sstable on Locator.
	ObjName string
	// Size of the referenced proportion of the virtualized sstable. An estimate
	// is acceptable in lieu of the backing file size.
	Size uint64
	// SmallestUserKey and LargestUserKey are the [smallest,largest) user key
	// bounds of the sstable. Both these bounds are loose i.e. it's possible for
	// the sstable to not span the entirety of this range. However, multiple
	// ExternalFiles in one ingestion must all have non-overlapping
	// [smallest, largest) spans. Note that this Largest bound is exclusive.
	SmallestUserKey, LargestUserKey []byte
	// HasPointKey and HasRangeKey denote whether this file contains point keys
	// or range keys. If both structs are false, an error is returned during
	// ingestion.
	HasPointKey, HasRangeKey bool
}

// IngestWithStats does the same as Ingest, and additionally returns
// IngestOperationStats.
func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}
	if d.opts.ReadOnly {
		return IngestOperationStats{}, ErrReadOnly
	}
	return d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */)
}

// IngestExternalFiles does the same as IngestWithStats, and additionally
// accepts external files (with locator info that can be resolved using
// d.opts.SharedStorage). These files must also be non-overlapping with
// each other, and must be resolvable through d.objProvider.
+func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + + if d.opts.ReadOnly { + return IngestOperationStats{}, ErrReadOnly + } + if d.opts.Experimental.RemoteStorage == nil { + return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured") + } + return d.ingest(nil, ingestTargetLevel, nil /* shared */, KeyRange{}, external) +} + +// IngestAndExcise does the same as IngestWithStats, and additionally accepts a +// list of shared files to ingest that can be read from a remote.Storage through +// a Provider. All the shared files must live within exciseSpan, and any existing +// keys in exciseSpan are deleted by turning existing sstables into virtual +// sstables (if not virtual already) and shrinking their spans to exclude +// exciseSpan. See the comment at Ingest for a more complete picture of the +// ingestion process. +// +// Panics if this DB instance was not instantiated with a remote.Storage and +// shared sstables are present. +func (d *DB) IngestAndExcise( + paths []string, shared []SharedSSTMeta, exciseSpan KeyRange, +) (IngestOperationStats, error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.ReadOnly { + return IngestOperationStats{}, ErrReadOnly + } + return d.ingest(paths, ingestTargetLevel, shared, exciseSpan, nil /* external */) +} + +// Both DB.mu and commitPipeline.mu must be held while this is called. +func (d *DB) newIngestedFlushableEntry( + meta []*fileMetadata, seqNum uint64, logNum base.DiskFileNum, +) (*flushableEntry, error) { + // Update the sequence number for all of the sstables in the + // metadata. Writing the metadata to the manifest when the + // version edit is applied is the mechanism that persists the + // sequence number. The sstables themselves are left unmodified. 
+ // In this case, a version edit will only be written to the manifest + // when the flushable is eventually flushed. If Pebble restarts in that + // time, then we'll lose the ingest sequence number information. But this + // information will also be reconstructed on node restart. + if err := ingestUpdateSeqNum( + d.cmp, d.opts.Comparer.FormatKey, seqNum, ingestLoadResult{localMeta: meta}, + ); err != nil { + return nil, err + } + + f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter) + + // NB: The logNum/seqNum are the WAL number which we're writing this entry + // to and the sequence number within the WAL which we'll write this entry + // to. + entry := d.newFlushableEntry(f, logNum, seqNum) + // The flushable entry starts off with a single reader ref, so increment + // the FileMetadata.Refs. + for _, file := range f.files { + file.Ref() + } + entry.unrefFiles = func() []*fileBacking { + var obsolete []*fileBacking + for _, file := range f.files { + if file.Unref() == 0 { + obsolete = append(obsolete, file.FileMetadata.FileBacking) + } + } + return obsolete + } + + entry.flushForced = true + entry.releaseMemAccounting = func() {} + return entry, nil +} + +// Both DB.mu and commitPipeline.mu must be held while this is called. Since +// we're holding both locks, the order in which we rotate the memtable or +// recycle the WAL in this function is irrelevant as long as the correct log +// numbers are assigned to the appropriate flushable. +func (d *DB) handleIngestAsFlushable(meta []*fileMetadata, seqNum uint64) error { + b := d.NewBatch() + for _, m := range meta { + b.ingestSST(m.FileNum) + } + b.setSeqNum(seqNum) + + // If the WAL is disabled, then the logNum used to create the flushable + // entry doesn't matter. We just use the logNum assigned to the current + // mutable memtable. 
If the WAL is enabled, then this logNum will be + // overwritten by the logNum of the log which will contain the log entry + // for the ingestedFlushable. + logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum + if !d.opts.DisableWAL { + // We create a new WAL for the flushable instead of reusing the end of + // the previous WAL. This simplifies the increment of the minimum + // unflushed log number, and also simplifies WAL replay. + logNum, _ = d.recycleWAL() + d.mu.Unlock() + err := d.commit.directWrite(b) + if err != nil { + d.opts.Logger.Fatalf("%v", err) + } + d.mu.Lock() + } + + entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum) + if err != nil { + return err + } + nextSeqNum := seqNum + uint64(b.Count()) + + // Set newLogNum to the logNum of the previous flushable. This value is + // irrelevant if the WAL is disabled. If the WAL is enabled, then we set + // the appropriate value below. + newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum + if !d.opts.DisableWAL { + // This is WAL num of the next mutable memtable which comes after the + // ingestedFlushable in the flushable queue. The mutable memtable + // will be created below. + newLogNum, _ = d.recycleWAL() + if err != nil { + return err + } + } + + currMem := d.mu.mem.mutable + // NB: Placing ingested sstables above the current memtables + // requires rotating of the existing memtables/WAL. There is + // some concern of churning through tiny memtables due to + // ingested sstables being placed on top of them, but those + // memtables would have to be flushed anyways. + d.mu.mem.queue = append(d.mu.mem.queue, entry) + d.rotateMemtable(newLogNum, nextSeqNum, currMem) + d.updateReadStateLocked(d.opts.DebugCheck) + d.maybeScheduleFlush() + return nil +} + +// See comment at Ingest() for details on how this works. 
+func (d *DB) ingest( + paths []string, + targetLevelFunc ingestTargetLevelFunc, + shared []SharedSSTMeta, + exciseSpan KeyRange, + external []ExternalFile, +) (IngestOperationStats, error) { + if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil { + panic("cannot ingest shared sstables with nil SharedStorage") + } + if (exciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables { + return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion") + } + // Allocate file numbers for all of the files being ingested and mark them as + // pending in order to prevent them from being deleted. Note that this causes + // the file number ordering to be out of alignment with sequence number + // ordering. The sorting of L0 tables by sequence number avoids relying on + // that (busted) invariant. + d.mu.Lock() + pendingOutputs := make([]base.DiskFileNum, len(paths)+len(shared)+len(external)) + for i := 0; i < len(paths)+len(shared)+len(external); i++ { + pendingOutputs[i] = d.mu.versions.getNextDiskFileNum() + } + + jobID := d.mu.nextJobID + d.mu.nextJobID++ + d.mu.Unlock() + + // Load the metadata for all the files being ingested. This step detects + // and elides empty sstables. + loadResult, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, shared, external, d.cacheID, pendingOutputs, d.objProvider, jobID) + if err != nil { + return IngestOperationStats{}, err + } + + if loadResult.fileCount == 0 { + // All of the sstables to be ingested were empty. Nothing to do. + return IngestOperationStats{}, nil + } + + // Verify the sstables do not overlap. + if err := ingestSortAndVerify(d.cmp, loadResult, exciseSpan); err != nil { + return IngestOperationStats{}, err + } + + // Hard link the sstables into the DB directory. Since the sstables aren't + // referenced by a version, they won't be used. If the hard linking fails + // (e.g. 
because the files reside on a different filesystem), ingestLink will + // fall back to copying, and if that fails we undo our work and return an + // error. + if err := ingestLink(jobID, d.opts, d.objProvider, loadResult, shared); err != nil { + return IngestOperationStats{}, err + } + + // Make the new tables durable. We need to do this at some point before we + // update the MANIFEST (via logAndApply), otherwise a crash can have the + // tables referenced in the MANIFEST, but not present in the provider. + if err := d.objProvider.Sync(); err != nil { + return IngestOperationStats{}, err + } + + // metaFlushableOverlaps is a slice parallel to meta indicating which of the + // ingested sstables overlap some table in the flushable queue. It's used to + // approximate ingest-into-L0 stats when using flushable ingests. + metaFlushableOverlaps := make([]bool, loadResult.fileCount) + var mem *flushableEntry + var mut *memTable + // asFlushable indicates whether the sstable was ingested as a flushable. + var asFlushable bool + iterOps := IterOptions{ + CategoryAndQoS: sstable.CategoryAndQoS{ + Category: "pebble-ingest", + QoSLevel: sstable.LatencySensitiveQoSLevel, + }, + } + prepare := func(seqNum uint64) { + // Note that d.commit.mu is held by commitPipeline when calling prepare. + + d.mu.Lock() + defer d.mu.Unlock() + + // Check to see if any files overlap with any of the memtables. The queue + // is ordered from oldest to newest with the mutable memtable being the + // last element in the slice. We want to wait for the newest table that + // overlaps. + + for i := len(d.mu.mem.queue) - 1; i >= 0; i-- { + m := d.mu.mem.queue[i] + iter := m.newIter(&iterOps) + rangeDelIter := m.newRangeDelIter(&iterOps) + rkeyIter := m.newRangeKeyIter(&iterOps) + + checkForOverlap := func(i int, meta *fileMetadata) { + if metaFlushableOverlaps[i] { + // This table already overlapped a more recent flushable. 
+ return + } + kr := internalKeyRange{ + smallest: meta.Smallest, + largest: meta.Largest, + } + if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) { + // If this is the first table to overlap a flushable, save + // the flushable. This ingest must be ingested or flushed + // after it. + if mem == nil { + mem = m + } + metaFlushableOverlaps[i] = true + } + } + for i := range loadResult.localMeta { + checkForOverlap(i, loadResult.localMeta[i]) + } + for i := range loadResult.sharedMeta { + checkForOverlap(len(loadResult.localMeta)+i, loadResult.sharedMeta[i]) + } + for i := range loadResult.externalMeta { + checkForOverlap(len(loadResult.localMeta)+len(loadResult.sharedMeta)+i, loadResult.externalMeta[i]) + } + if exciseSpan.Valid() { + kr := internalKeyRange{ + smallest: base.MakeInternalKey(exciseSpan.Start, InternalKeySeqNumMax, InternalKeyKindMax), + largest: base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, exciseSpan.End), + } + if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) { + if mem == nil { + mem = m + } + } + } + err := iter.Close() + if rangeDelIter != nil { + err = firstError(err, rangeDelIter.Close()) + } + if rkeyIter != nil { + err = firstError(err, rkeyIter.Close()) + } + if err != nil { + d.opts.Logger.Errorf("ingest error reading flushable for log %s: %s", m.logNum, err) + } + } + + if mem == nil { + // No overlap with any of the queued flushables, so no need to queue + // after them. + + // New writes with higher sequence numbers may be concurrently + // committed. We must ensure they don't flush before this ingest + // completes. To do that, we ref the mutable memtable as a writer, + // preventing its flushing (and the flushing of all subsequent + // flushables in the queue). Once we've acquired the manifest lock + // to add the ingested sstables to the LSM, we can unref as we're + // guaranteed that the flush won't edit the LSM before this ingest. 
+ mut = d.mu.mem.mutable + mut.writerRef() + return + } + // The ingestion overlaps with some entry in the flushable queue. + if d.FormatMajorVersion() < FormatFlushableIngest || + d.opts.Experimental.DisableIngestAsFlushable() || + len(shared) > 0 || exciseSpan.Valid() || len(external) > 0 || + (len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) { + // We're not able to ingest as a flushable, + // so we must synchronously flush. + // + // TODO(bilal): Currently, if any of the files being ingested are shared or + // there's an excise span present, we cannot use flushable ingests and need + // to wait synchronously. Either remove this caveat by fleshing out + // flushable ingest logic to also account for these cases, or remove this + // comment. Tracking issue: https://github.com/cockroachdb/pebble/issues/2676 + if mem.flushable == d.mu.mem.mutable { + err = d.makeRoomForWrite(nil) + } + // New writes with higher sequence numbers may be concurrently + // committed. We must ensure they don't flush before this ingest + // completes. To do that, we ref the mutable memtable as a writer, + // preventing its flushing (and the flushing of all subsequent + // flushables in the queue). Once we've acquired the manifest lock + // to add the ingested sstables to the LSM, we can unref as we're + // guaranteed that the flush won't edit the LSM before this ingest. + mut = d.mu.mem.mutable + mut.writerRef() + mem.flushForced = true + d.maybeScheduleFlush() + return + } + // Since there aren't too many memtables already queued up, we can + // slide the ingested sstables on top of the existing memtables. + asFlushable = true + err = d.handleIngestAsFlushable(loadResult.localMeta, seqNum) + } + + var ve *versionEdit + apply := func(seqNum uint64) { + if err != nil || asFlushable { + // An error occurred during prepare. 
+ if mut != nil { + if mut.writerUnref() { + d.mu.Lock() + d.maybeScheduleFlush() + d.mu.Unlock() + } + } + return + } + + // Update the sequence numbers for all ingested sstables' + // metadata. When the version edit is applied, the metadata is + // written to the manifest, persisting the sequence number. + // The sstables themselves are left unmodified. + if err = ingestUpdateSeqNum( + d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult, + ); err != nil { + if mut != nil { + if mut.writerUnref() { + d.mu.Lock() + d.maybeScheduleFlush() + d.mu.Unlock() + } + } + return + } + + // If we overlapped with a memtable in prepare wait for the flush to + // finish. + if mem != nil { + <-mem.flushed + } + + // Assign the sstables to the correct level in the LSM and apply the + // version edit. + ve, err = d.ingestApply(jobID, loadResult, targetLevelFunc, mut, exciseSpan) + } + + // Only one ingest can occur at a time because if not, one would block waiting + // for the other to finish applying. This blocking would happen while holding + // the commit mutex which would prevent unrelated batches from writing their + // changes to the WAL and memtable. This will cause a bigger commit hiccup + // during ingestion. + d.commit.ingestSem <- struct{}{} + d.commit.AllocateSeqNum(loadResult.fileCount, prepare, apply) + <-d.commit.ingestSem + + if err != nil { + if err2 := ingestCleanup(d.objProvider, loadResult.localMeta); err2 != nil { + d.opts.Logger.Errorf("ingest cleanup failed: %v", err2) + } + } else { + // Since we either created a hard link to the ingesting files, or copied + // them over, it is safe to remove the originals paths. 
+ for _, path := range loadResult.localPaths { + if err2 := d.opts.FS.Remove(path); err2 != nil { + d.opts.Logger.Errorf("ingest failed to remove original file: %s", err2) + } + } + } + + info := TableIngestInfo{ + JobID: jobID, + Err: err, + flushable: asFlushable, + } + if len(loadResult.localMeta) > 0 { + info.GlobalSeqNum = loadResult.localMeta[0].SmallestSeqNum + } else if len(loadResult.sharedMeta) > 0 { + info.GlobalSeqNum = loadResult.sharedMeta[0].SmallestSeqNum + } else { + info.GlobalSeqNum = loadResult.externalMeta[0].SmallestSeqNum + } + var stats IngestOperationStats + if ve != nil { + info.Tables = make([]struct { + TableInfo + Level int + }, len(ve.NewFiles)) + for i := range ve.NewFiles { + e := &ve.NewFiles[i] + info.Tables[i].Level = e.Level + info.Tables[i].TableInfo = e.Meta.TableInfo() + stats.Bytes += e.Meta.Size + if e.Level == 0 { + stats.ApproxIngestedIntoL0Bytes += e.Meta.Size + } + if i < len(metaFlushableOverlaps) && metaFlushableOverlaps[i] { + stats.MemtableOverlappingFiles++ + } + } + } else if asFlushable { + // NB: If asFlushable == true, there are no shared sstables. + info.Tables = make([]struct { + TableInfo + Level int + }, len(loadResult.localMeta)) + for i, f := range loadResult.localMeta { + info.Tables[i].Level = -1 + info.Tables[i].TableInfo = f.TableInfo() + stats.Bytes += f.Size + // We don't have exact stats on which files will be ingested into + // L0, because actual ingestion into the LSM has been deferred until + // flush time. Instead, we infer based on memtable overlap. + // + // TODO(jackson): If we optimistically compute data overlap (#2112) + // before entering the commit pipeline, we can use that overlap to + // improve our approximation by incorporating overlap with L0, not + // just memtables. 
+ if metaFlushableOverlaps[i] { + stats.ApproxIngestedIntoL0Bytes += f.Size + stats.MemtableOverlappingFiles++ + } + } + } + d.opts.EventListener.TableIngested(info) + + return stats, err +} + +// excise updates ve to include a replacement of the file m with new virtual +// sstables that exclude exciseSpan, returning a slice of newly-created files if +// any. If the entirety of m is deleted by exciseSpan, no new sstables are added +// and m is deleted. Note that ve is updated in-place. +// +// The manifest lock must be held when calling this method. +func (d *DB) excise( + exciseSpan KeyRange, m *fileMetadata, ve *versionEdit, level int, +) ([]manifest.NewFileEntry, error) { + numCreatedFiles := 0 + // Check if there's actually an overlap between m and exciseSpan. + if !exciseSpan.Overlaps(d.cmp, m) { + return nil, nil + } + ve.DeletedFiles[deletedFileEntry{ + Level: level, + FileNum: m.FileNum, + }] = m + // Fast path: m sits entirely within the exciseSpan, so just delete it. + if exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { + return nil, nil + } + var iter internalIterator + var rangeDelIter keyspan.FragmentIterator + var rangeKeyIter keyspan.FragmentIterator + needsBacking := false + // Create a file to the left of the excise span, if necessary. + // The bounds of this file will be [m.Smallest, lastKeyBefore(exciseSpan.Start)]. + // + // We create bounds that are tight on user keys, and we make the effort to find + // the last key in the original sstable that's smaller than exciseSpan.Start + // even though it requires some sstable reads. We could choose to create + // virtual sstables on loose userKey bounds, in which case we could just set + // leftFile.Largest to an exclusive sentinel at exciseSpan.Start. The biggest + // issue with that approach would be that it'd lead to lots of small virtual + // sstables in the LSM that have no guarantee on containing even a single user + // key within the file bounds. 
This has the potential to increase both read and + // write-amp as we will be opening up these sstables only to find no relevant + // keys in the read path, and compacting sstables on top of them instead of + // directly into the space occupied by them. We choose to incur the cost of + // calculating tight bounds at this time instead of creating more work in the + // future. + // + // TODO(bilal): Some of this work can happen without grabbing the manifest + // lock; we could grab one currentVersion, release the lock, calculate excised + // files, then grab the lock again and recalculate for just the files that + // have changed since our previous calculation. Do this optimiaztino as part of + // https://github.com/cockroachdb/pebble/issues/2112 . + if d.cmp(m.Smallest.UserKey, exciseSpan.Start) < 0 { + leftFile := &fileMetadata{ + Virtual: true, + FileBacking: m.FileBacking, + FileNum: d.mu.versions.getNextFileNum(), + // Note that these are loose bounds for smallest/largest seqnums, but they're + // sufficient for maintaining correctness. + SmallestSeqNum: m.SmallestSeqNum, + LargestSeqNum: m.LargestSeqNum, + } + if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.SmallestPointKey) { + // This file will contain point keys + smallestPointKey := m.SmallestPointKey + var err error + iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{ + CategoryAndQoS: sstable.CategoryAndQoS{ + Category: "pebble-ingest", + QoSLevel: sstable.LatencySensitiveQoSLevel, + }, + level: manifest.Level(level), + }, internalIterOpts{}) + if err != nil { + return nil, err + } + var key *InternalKey + if iter != nil { + defer iter.Close() + key, _ = iter.SeekLT(exciseSpan.Start, base.SeekLTFlagsNone) + } else { + iter = emptyIter + } + if key != nil { + leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, key.Clone()) + } + // Store the min of (exciseSpan.Start, rdel.End) in lastRangeDel. This + // needs to be a copy if the key is owned by the range del iter. 
+ var lastRangeDel []byte + if rangeDelIter != nil { + defer rangeDelIter.Close() + rdel := rangeDelIter.SeekLT(exciseSpan.Start) + if rdel != nil { + lastRangeDel = append(lastRangeDel[:0], rdel.End...) + if d.cmp(lastRangeDel, exciseSpan.Start) > 0 { + lastRangeDel = exciseSpan.Start + } + } + } else { + rangeDelIter = emptyKeyspanIter + } + if lastRangeDel != nil { + leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, lastRangeDel)) + } + } + if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.SmallestRangeKey) { + // This file will contain range keys + var err error + smallestRangeKey := m.SmallestRangeKey + rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) + if err != nil { + return nil, err + } + // Store the min of (exciseSpan.Start, rkey.End) in lastRangeKey. This + // needs to be a copy if the key is owned by the range key iter. + var lastRangeKey []byte + var lastRangeKeyKind InternalKeyKind + defer rangeKeyIter.Close() + rkey := rangeKeyIter.SeekLT(exciseSpan.Start) + if rkey != nil { + lastRangeKey = append(lastRangeKey[:0], rkey.End...) + if d.cmp(lastRangeKey, exciseSpan.Start) > 0 { + lastRangeKey = exciseSpan.Start + } + lastRangeKeyKind = rkey.Keys[0].Kind() + } + if lastRangeKey != nil { + leftFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, base.MakeExclusiveSentinelKey(lastRangeKeyKind, lastRangeKey)) + } + } + if leftFile.HasRangeKeys || leftFile.HasPointKeys { + var err error + leftFile.Size, err = d.tableCache.estimateSize(m, leftFile.Smallest.UserKey, leftFile.Largest.UserKey) + if err != nil { + return nil, err + } + if leftFile.Size == 0 { + // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size, + // such as if the excised file only has range keys/dels and no point + // keys. This can cause panics in places where we divide by file sizes. + // Correct for it here. 
+ leftFile.Size = 1 + } + if err := leftFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { + return nil, err + } + leftFile.ValidateVirtual(m) + ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: leftFile}) + needsBacking = true + numCreatedFiles++ + } + } + // Create a file to the right, if necessary. + if exciseSpan.Contains(d.cmp, m.Largest) { + // No key exists to the right of the excise span in this file. + if needsBacking && !m.Virtual { + // If m is virtual, then its file backing is already known to the manifest. + // We don't need to create another file backing. Note that there must be + // only one CreatedBackingTables entry per backing sstable. This is + // indicated by the VersionEdit.CreatedBackingTables invariant. + ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) + } + return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil + } + // Create a new file, rightFile, between [firstKeyAfter(exciseSpan.End), m.Largest]. + // + // See comment before the definition of leftFile for the motivation behind + // calculating tight user-key bounds. + rightFile := &fileMetadata{ + Virtual: true, + FileBacking: m.FileBacking, + FileNum: d.mu.versions.getNextFileNum(), + // Note that these are loose bounds for smallest/largest seqnums, but they're + // sufficient for maintaining correctness. 
+ SmallestSeqNum: m.SmallestSeqNum, + LargestSeqNum: m.LargestSeqNum, + } + if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.LargestPointKey) { + // This file will contain point keys + largestPointKey := m.LargestPointKey + var err error + if iter == nil && rangeDelIter == nil { + iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{ + CategoryAndQoS: sstable.CategoryAndQoS{ + Category: "pebble-ingest", + QoSLevel: sstable.LatencySensitiveQoSLevel, + }, + level: manifest.Level(level), + }, internalIterOpts{}) + if err != nil { + return nil, err + } + if iter != nil { + defer iter.Close() + } else { + iter = emptyIter + } + if rangeDelIter != nil { + defer rangeDelIter.Close() + } else { + rangeDelIter = emptyKeyspanIter + } + } + key, _ := iter.SeekGE(exciseSpan.End, base.SeekGEFlagsNone) + if key != nil { + rightFile.ExtendPointKeyBounds(d.cmp, key.Clone(), largestPointKey) + } + // Store the max of (exciseSpan.End, rdel.Start) in firstRangeDel. This + // needs to be a copy if the key is owned by the range del iter. + var firstRangeDel []byte + rdel := rangeDelIter.SeekGE(exciseSpan.End) + if rdel != nil { + firstRangeDel = append(firstRangeDel[:0], rdel.Start...) + if d.cmp(firstRangeDel, exciseSpan.End) < 0 { + firstRangeDel = exciseSpan.End + } + } + if firstRangeDel != nil { + smallestPointKey := rdel.SmallestKey() + smallestPointKey.UserKey = firstRangeDel + rightFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, largestPointKey) + } + } + if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.LargestRangeKey) { + // This file will contain range keys. + largestRangeKey := m.LargestRangeKey + if rangeKeyIter == nil { + var err error + rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) + if err != nil { + return nil, err + } + defer rangeKeyIter.Close() + } + // Store the max of (exciseSpan.End, rkey.Start) in firstRangeKey. This + // needs to be a copy if the key is owned by the range key iter. 
+ var firstRangeKey []byte + rkey := rangeKeyIter.SeekGE(exciseSpan.End) + if rkey != nil { + firstRangeKey = append(firstRangeKey[:0], rkey.Start...) + if d.cmp(firstRangeKey, exciseSpan.End) < 0 { + firstRangeKey = exciseSpan.End + } + } + if firstRangeKey != nil { + smallestRangeKey := rkey.SmallestKey() + smallestRangeKey.UserKey = firstRangeKey + // We call ExtendRangeKeyBounds so any internal boundType fields are + // set correctly. Note that this is mildly wasteful as we'll be comparing + // rightFile.{Smallest,Largest}RangeKey with themselves, which can be + // avoided if we exported ExtendOverallKeyBounds or so. + rightFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, largestRangeKey) + } + } + if rightFile.HasRangeKeys || rightFile.HasPointKeys { + var err error + rightFile.Size, err = d.tableCache.estimateSize(m, rightFile.Smallest.UserKey, rightFile.Largest.UserKey) + if err != nil { + return nil, err + } + if rightFile.Size == 0 { + // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size, + // such as if the excised file only has range keys/dels and no point keys. + // This can cause panics in places where we divide by file sizes. Correct + // for it here. + rightFile.Size = 1 + } + rightFile.ValidateVirtual(m) + ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: rightFile}) + needsBacking = true + numCreatedFiles++ + } + + if needsBacking && !m.Virtual { + // If m is virtual, then its file backing is already known to the manifest. + // We don't need to create another file backing. Note that there must be + // only one CreatedBackingTables entry per backing sstable. This is + // indicated by the VersionEdit.CreatedBackingTables invariant. 
+ ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) + } + + if err := rightFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { + return nil, err + } + return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil +} + +type ingestTargetLevelFunc func( + newIters tableNewIters, + newRangeKeyIter keyspan.TableNewSpanIter, + iterOps IterOptions, + comparer *Comparer, + v *version, + baseLevel int, + compactions map[*compaction]struct{}, + meta *fileMetadata, + suggestSplit bool, +) (int, *fileMetadata, error) + +type ingestSplitFile struct { + // ingestFile is the file being ingested. + ingestFile *fileMetadata + // splitFile is the file that needs to be split to allow ingestFile to slot + // into `level` level. + splitFile *fileMetadata + // The level where ingestFile will go (and where splitFile already is). + level int +} + +// ingestSplit splits files specified in `files` and updates ve in-place to +// account for existing files getting split into two virtual sstables. The map +// `replacedFiles` contains an in-progress map of all files that have been +// replaced with new virtual sstables in this version edit so far, which is also +// updated in-place. +// +// d.mu as well as the manifest lock must be held when calling this method. +func (d *DB) ingestSplit( + ve *versionEdit, + updateMetrics func(*fileMetadata, int, []newFileEntry), + files []ingestSplitFile, + replacedFiles map[base.FileNum][]newFileEntry, +) error { + for _, s := range files { + // replacedFiles can be thought of as a tree, where we start iterating with + // s.splitFile and run its fileNum through replacedFiles, then find which of + // the replaced files overlaps with s.ingestFile, which becomes the new + // splitFile, then we check splitFile's replacements in replacedFiles again + // for overlap with s.ingestFile, and so on until we either can't find the + // current splitFile in replacedFiles (i.e. 
that's the file that now needs to + // be split), or we don't find a file that overlaps with s.ingestFile, which + // means a prior ingest split already produced enough room for s.ingestFile + // to go into this level without necessitating another ingest split. + splitFile := s.splitFile + for splitFile != nil { + replaced, ok := replacedFiles[splitFile.FileNum] + if !ok { + break + } + updatedSplitFile := false + for i := range replaced { + if replaced[i].Meta.Overlaps(d.cmp, s.ingestFile.Smallest.UserKey, s.ingestFile.Largest.UserKey, s.ingestFile.Largest.IsExclusiveSentinel()) { + if updatedSplitFile { + // This should never happen because the earlier ingestTargetLevel + // function only finds split file candidates that are guaranteed to + // have no data overlap, only boundary overlap. See the comments + // in that method to see the definitions of data vs boundary + // overlap. That, plus the fact that files in `replaced` are + // guaranteed to have file bounds that are tight on user keys + // (as that's what `d.excise` produces), means that the only case + // where we overlap with two or more files in `replaced` is if we + // actually had data overlap all along, or if the ingestion files + // were overlapping, either of which is an invariant violation. + panic("updated with two files in ingestSplit") + } + splitFile = replaced[i].Meta + updatedSplitFile = true + } + } + if !updatedSplitFile { + // None of the replaced files overlapped with the file being ingested. + // This can happen if we've already excised a span overlapping with + // this file, or if we have consecutive ingested files that can slide + // within the same gap between keys in an existing file. For instance, + // if an existing file has keys a and g and we're ingesting b-c, d-e, + // the first loop iteration will split the existing file into one that + // ends in a and another that starts at g, and the second iteration will + // fall into this case and require no splitting. 
+ // + // No splitting necessary. + splitFile = nil + } + } + if splitFile == nil { + continue + } + // NB: excise operates on [start, end). We're splitting at [start, end] + // (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). The conflation + // of exclusive vs inclusive end bounds should not make a difference here + // as we're guaranteed to not have any data overlap between splitFile and + // s.ingestFile, so panic if we do see a newly added file with an endKey + // equalling s.ingestFile.Largest, and !s.ingestFile.Largest.IsExclusiveSentinel() + added, err := d.excise(KeyRange{Start: s.ingestFile.Smallest.UserKey, End: s.ingestFile.Largest.UserKey}, splitFile, ve, s.level) + if err != nil { + return err + } + if _, ok := ve.DeletedFiles[deletedFileEntry{ + Level: s.level, + FileNum: splitFile.FileNum, + }]; !ok { + panic("did not split file that was expected to be split") + } + replacedFiles[splitFile.FileNum] = added + for i := range added { + if s.ingestFile.Overlaps(d.cmp, added[i].Meta.Smallest.UserKey, added[i].Meta.Largest.UserKey, added[i].Meta.Largest.IsExclusiveSentinel()) { + panic("ingest-time split produced a file that overlaps with ingested file") + } + } + updateMetrics(splitFile, s.level, added) + } + // Flatten the version edit by removing any entries from ve.NewFiles that + // are also in ve.DeletedFiles. 
+ newNewFiles := ve.NewFiles[:0] + for i := range ve.NewFiles { + fn := ve.NewFiles[i].Meta.FileNum + deEntry := deletedFileEntry{Level: ve.NewFiles[i].Level, FileNum: fn} + if _, ok := ve.DeletedFiles[deEntry]; ok { + delete(ve.DeletedFiles, deEntry) + } else { + newNewFiles = append(newNewFiles, ve.NewFiles[i]) + } + } + ve.NewFiles = newNewFiles + return nil +} + +func (d *DB) ingestApply( + jobID int, + lr ingestLoadResult, + findTargetLevel ingestTargetLevelFunc, + mut *memTable, + exciseSpan KeyRange, +) (*versionEdit, error) { + d.mu.Lock() + defer d.mu.Unlock() + + ve := &versionEdit{ + NewFiles: make([]newFileEntry, lr.fileCount), + } + if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) { + ve.DeletedFiles = map[manifest.DeletedFileEntry]*manifest.FileMetadata{} + } + metrics := make(map[int]*LevelMetrics) + + // Lock the manifest for writing before we use the current version to + // determine the target level. This prevents two concurrent ingestion jobs + // from using the same version to determine the target level, and also + // provides serialization with concurrent compaction and flush jobs. + // logAndApply unconditionally releases the manifest lock, but any earlier + // returns must unlock the manifest. + d.mu.versions.logLock() + + if mut != nil { + // Unref the mutable memtable to allows its flush to proceed. Now that we've + // acquired the manifest lock, we can be certain that if the mutable + // memtable has received more recent conflicting writes, the flush won't + // beat us to applying to the manifest resulting in sequence number + // inversion. Even though we call maybeScheduleFlush right now, this flush + // will apply after our ingestion. 
+ if mut.writerUnref() { + d.maybeScheduleFlush() + } + } + + shouldIngestSplit := d.opts.Experimental.IngestSplit != nil && + d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables + current := d.mu.versions.currentVersion() + baseLevel := d.mu.versions.picker.getBaseLevel() + iterOps := IterOptions{logger: d.opts.Logger} + // filesToSplit is a list where each element is a pair consisting of a file + // being ingested and a file being split to make room for an ingestion into + // that level. Each ingested file will appear at most once in this list. It + // is possible for split files to appear twice in this list. + filesToSplit := make([]ingestSplitFile, 0) + checkCompactions := false + for i := 0; i < lr.fileCount; i++ { + // Determine the lowest level in the LSM for which the sstable doesn't + // overlap any existing files in the level. + var m *fileMetadata + sharedIdx := -1 + sharedLevel := -1 + externalFile := false + if i < len(lr.localMeta) { + // local file. + m = lr.localMeta[i] + } else if (i - len(lr.localMeta)) < len(lr.sharedMeta) { + // shared file. + sharedIdx = i - len(lr.localMeta) + m = lr.sharedMeta[sharedIdx] + sharedLevel = int(lr.sharedLevels[sharedIdx]) + } else { + // external file. + externalFile = true + m = lr.externalMeta[i-(len(lr.localMeta)+len(lr.sharedMeta))] + } + f := &ve.NewFiles[i] + var err error + if sharedIdx >= 0 { + f.Level = sharedLevel + if f.Level < sharedLevelsStart { + panic("cannot slot a shared file higher than the highest shared level") + } + ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) + } else { + if externalFile { + ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) + } + var splitFile *fileMetadata + if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { + // This file fits perfectly within the excise span. We can slot it at + // L6, or sharedLevelsStart - 1 if we have shared files. 
+ if len(lr.sharedMeta) > 0 { + f.Level = sharedLevelsStart - 1 + if baseLevel > f.Level { + f.Level = 0 + } + } else { + f.Level = 6 + } + } else { + // TODO(bilal): findTargetLevel does disk IO (reading files for data + // overlap) even though we're holding onto d.mu. Consider unlocking + // d.mu while we do this. We already hold versions.logLock so we should + // not see any version applications while we're at this. The one + // complication here would be pulling out the mu.compact.inProgress + // check from findTargetLevel, as that requires d.mu to be held. + f.Level, splitFile, err = findTargetLevel( + d.newIters, d.tableNewRangeKeyIter, iterOps, d.opts.Comparer, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit) + } + + if splitFile != nil { + if invariants.Enabled { + if lf := current.Levels[f.Level].Find(d.cmp, splitFile); lf == nil { + panic("splitFile returned is not in level it should be") + } + } + // We take advantage of the fact that we won't drop the db mutex + // between now and the call to logAndApply. So, no files should + // get added to a new in-progress compaction at this point. We can + // avoid having to iterate on in-progress compactions to cancel them + // if none of the files being split have a compacting state. + if splitFile.IsCompacting() { + checkCompactions = true + } + filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitFile, level: f.Level}) + } + } + if err != nil { + d.mu.versions.logUnlock() + return nil, err + } + f.Meta = m + levelMetrics := metrics[f.Level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + metrics[f.Level] = levelMetrics + } + levelMetrics.NumFiles++ + levelMetrics.Size += int64(m.Size) + levelMetrics.BytesIngested += m.Size + levelMetrics.TablesIngested++ + } + // replacedFiles maps files excised due to exciseSpan (or splitFiles returned + // by ingestTargetLevel), to files that were created to replace it. 
This map + // is used to resolve references to split files in filesToSplit, as it is + // possible for a file that we want to split to no longer exist or have a + // newer fileMetadata due to a split induced by another ingestion file, or an + // excise. + replacedFiles := make(map[base.FileNum][]newFileEntry) + updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) { + levelMetrics := metrics[level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + metrics[level] = levelMetrics + } + levelMetrics.NumFiles-- + levelMetrics.Size -= int64(m.Size) + for i := range added { + levelMetrics.NumFiles++ + levelMetrics.Size += int64(added[i].Meta.Size) + } + } + if exciseSpan.Valid() { + // Iterate through all levels and find files that intersect with exciseSpan. + // + // TODO(bilal): We could drop the DB mutex here as we don't need it for + // excises; we only need to hold the version lock which we already are + // holding. However releasing the DB mutex could mess with the + // ingestTargetLevel calculation that happened above, as it assumed that it + // had a complete view of in-progress compactions that wouldn't change + // until logAndApply is called. If we were to drop the mutex now, we could + // schedule another in-progress compaction that would go into the chosen target + // level and lead to file overlap within level (which would panic in + // logAndApply). We should drop the db mutex here, do the excise, then + // re-grab the DB mutex and rerun just the in-progress compaction check to + // see if any new compactions are conflicting with our chosen target levels + // for files, and if they are, we should signal those compactions to error + // out. 
+ for level := range current.Levels { + overlaps := current.Overlaps(level, d.cmp, exciseSpan.Start, exciseSpan.End, true /* exclusiveEnd */) + iter := overlaps.Iter() + + for m := iter.First(); m != nil; m = iter.Next() { + newFiles, err := d.excise(exciseSpan, m, ve, level) + if err != nil { + return nil, err + } + + if _, ok := ve.DeletedFiles[deletedFileEntry{ + Level: level, + FileNum: m.FileNum, + }]; !ok { + // We did not excise this file. + continue + } + replacedFiles[m.FileNum] = newFiles + updateLevelMetricsOnExcise(m, level, newFiles) + } + } + } + if len(filesToSplit) > 0 { + // For the same reasons as the above call to excise, we hold the db mutex + // while calling this method. + if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, filesToSplit, replacedFiles); err != nil { + return nil, err + } + } + if len(filesToSplit) > 0 || exciseSpan.Valid() { + for c := range d.mu.compact.inProgress { + if c.versionEditApplied { + continue + } + // Check if this compaction overlaps with the excise span. Note that just + // checking if the inputs individually overlap with the excise span + // isn't sufficient; for instance, a compaction could have [a,b] and [e,f] + // as inputs and write it all out as [a,b,e,f] in one sstable. If we're + // doing a [c,d) excise at the same time as this compaction, we will have + // to error out the whole compaction as we can't guarantee it hasn't/won't + // write a file overlapping with the excise span. + if exciseSpan.OverlapsInternalKeyRange(d.cmp, c.smallest, c.largest) { + c.cancel.Store(true) + } + // Check if this compaction's inputs have been replaced due to an + // ingest-time split. In that case, cancel the compaction as a newly picked + // compaction would need to include any new files that slid in between + // previously-existing files. Note that we cancel any compaction that has a + // file that was ingest-split as an input, even if it started before this + // ingestion. 
+ if checkCompactions { + for i := range c.inputs { + iter := c.inputs[i].files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if _, ok := replacedFiles[f.FileNum]; ok { + c.cancel.Store(true) + break + } + } + } + } + } + // Check for any EventuallyFileOnlySnapshots that could be watching for + // an excise on this span. + if exciseSpan.Valid() { + for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next { + if s.efos == nil { + continue + } + efos := s.efos + // TODO(bilal): We can make this faster by taking advantage of the sorted + // nature of protectedRanges to do a sort.Search, or even maintaining a + // global list of all protected ranges instead of having to peer into every + // snapshot. + for i := range efos.protectedRanges { + if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) { + efos.excised.Store(true) + break + } + } + } + } + } + if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo { + return d.getInProgressCompactionInfoLocked(nil) + }); err != nil { + return nil, err + } + + d.mu.versions.metrics.Ingest.Count++ + + d.updateReadStateLocked(d.opts.DebugCheck) + // updateReadStateLocked could have generated obsolete tables, schedule a + // cleanup job if necessary. + d.deleteObsoleteFiles(jobID) + d.updateTableStatsLocked(ve.NewFiles) + // The ingestion may have pushed a level over the threshold for compaction, + // so check to see if one is necessary and schedule it. 
+ d.maybeScheduleCompaction() + var toValidate []manifest.NewFileEntry + dedup := make(map[base.DiskFileNum]struct{}) + for _, entry := range ve.NewFiles { + if _, ok := dedup[entry.Meta.FileBacking.DiskFileNum]; !ok { + toValidate = append(toValidate, entry) + dedup[entry.Meta.FileBacking.DiskFileNum] = struct{}{} + } + } + d.maybeValidateSSTablesLocked(toValidate) + return ve, nil +} + +// maybeValidateSSTablesLocked adds the slice of newFileEntrys to the pending +// queue of files to be validated, when the feature is enabled. +// +// Note that if two entries with the same backing file are added twice, then the +// block checksums for the backing file will be validated twice. +// +// DB.mu must be locked when calling. +func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) { + // Only add to the validation queue when the feature is enabled. + if !d.opts.Experimental.ValidateOnIngest { + return + } + + d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...) + if d.shouldValidateSSTablesLocked() { + go d.validateSSTables() + } +} + +// shouldValidateSSTablesLocked returns true if SSTable validation should run. +// DB.mu must be locked when calling. +func (d *DB) shouldValidateSSTablesLocked() bool { + return !d.mu.tableValidation.validating && + d.closed.Load() == nil && + d.opts.Experimental.ValidateOnIngest && + len(d.mu.tableValidation.pending) > 0 +} + +// validateSSTables runs a round of validation on the tables in the pending +// queue. +func (d *DB) validateSSTables() { + d.mu.Lock() + if !d.shouldValidateSSTablesLocked() { + d.mu.Unlock() + return + } + + pending := d.mu.tableValidation.pending + d.mu.tableValidation.pending = nil + d.mu.tableValidation.validating = true + jobID := d.mu.nextJobID + d.mu.nextJobID++ + rs := d.loadReadState() + + // Drop DB.mu before performing IO. + d.mu.Unlock() + + // Validate all tables in the pending queue. 
This could lead to a situation + // where we are starving IO from other tasks due to having to page through + // all the blocks in all the sstables in the queue. + // TODO(travers): Add some form of pacing to avoid IO starvation. + + // If we fail to validate any files due to reasons other than uncovered + // corruption, accumulate them and re-queue them for another attempt. + var retry []manifest.NewFileEntry + + for _, f := range pending { + // The file may have been moved or deleted since it was ingested, in + // which case we skip. + if !rs.current.Contains(f.Level, d.cmp, f.Meta) { + // Assume the file was moved to a lower level. It is rare enough + // that a table is moved or deleted between the time it was ingested + // and the time the validation routine runs that the overall cost of + // this inner loop is tolerably low, when amortized over all + // ingested tables. + found := false + for i := f.Level + 1; i < numLevels; i++ { + if rs.current.Contains(i, d.cmp, f.Meta) { + found = true + break + } + } + if !found { + continue + } + } + + var err error + if f.Meta.Virtual { + err = d.tableCache.withVirtualReader( + f.Meta.VirtualMeta(), func(v sstable.VirtualReader) error { + return v.ValidateBlockChecksumsOnBacking() + }) + } else { + err = d.tableCache.withReader( + f.Meta.PhysicalMeta(), func(r *sstable.Reader) error { + return r.ValidateBlockChecksums() + }) + } + + if err != nil { + if IsCorruptionError(err) { + // TODO(travers): Hook into the corruption reporting pipeline, once + // available. See pebble#1192. + d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err) + } else { + // If there was some other, possibly transient, error that + // caused table validation to fail inform the EventListener and + // move on. We remember the table so that we can retry it in a + // subsequent table validation job. + // + // TODO(jackson): If the error is not transient, this will retry + // validation indefinitely. 
While not great, it's the same + // behavior as erroring flushes and compactions. We should + // address this as a part of #270. + d.opts.EventListener.BackgroundError(err) + retry = append(retry, f) + continue + } + } + + d.opts.EventListener.TableValidated(TableValidatedInfo{ + JobID: jobID, + Meta: f.Meta, + }) + } + rs.unref() + d.mu.Lock() + defer d.mu.Unlock() + d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, retry...) + d.mu.tableValidation.validating = false + d.mu.tableValidation.cond.Broadcast() + if d.shouldValidateSSTablesLocked() { + go d.validateSSTables() + } +} diff --git a/pebble/ingest_test.go b/pebble/ingest_test.go new file mode 100644 index 0000000..c4dcc2a --- /dev/null +++ b/pebble/ingest_test.go @@ -0,0 +1,3516 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "context" + "fmt" + "io" + "math" + "os" + "path/filepath" + "slices" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/internal/rangekey" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/objstorage" + "github.com/cockroachdb/pebble/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/record" + "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/vfs/errorfs" + "github.com/kr/pretty" + "github.com/stretchr/testify/require" + "golang.org/x/exp/rand" +) + +func TestSSTableKeyCompare(t *testing.T) { + var buf 
bytes.Buffer + datadriven.RunTest(t, "testdata/sstable_key_compare", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "cmp": + buf.Reset() + for _, line := range strings.Split(td.Input, "\n") { + fields := strings.Fields(line) + a := base.ParseInternalKey(fields[0]) + b := base.ParseInternalKey(fields[1]) + got := sstableKeyCompare(testkeys.Comparer.Compare, a, b) + fmt.Fprintf(&buf, "%38s", fmt.Sprint(a.Pretty(base.DefaultFormatter))) + switch got { + case -1: + fmt.Fprint(&buf, " < ") + case +1: + fmt.Fprint(&buf, " > ") + case 0: + fmt.Fprint(&buf, " = ") + } + fmt.Fprintf(&buf, "%s\n", fmt.Sprint(b.Pretty(base.DefaultFormatter))) + } + return buf.String() + default: + return fmt.Sprintf("unrecognized command %q", td.Cmd) + } + }) +} + +func TestIngestLoad(t *testing.T) { + mem := vfs.NewMem() + + datadriven.RunTest(t, "testdata/ingest_load", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "load": + writerOpts := sstable.WriterOptions{} + var dbVersion FormatMajorVersion + for _, cmdArgs := range td.CmdArgs { + v, err := strconv.Atoi(cmdArgs.Vals[0]) + if err != nil { + return err.Error() + } + switch k := cmdArgs.Key; k { + case "writer-version": + fmv := FormatMajorVersion(v) + writerOpts.TableFormat = fmv.MaxTableFormat() + case "db-version": + dbVersion = FormatMajorVersion(v) + default: + return fmt.Sprintf("unknown cmd %s\n", k) + } + } + f, err := mem.Create("ext") + if err != nil { + return err.Error() + } + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writerOpts) + for _, data := range strings.Split(td.Input, "\n") { + if strings.HasPrefix(data, "rangekey: ") { + data = strings.TrimPrefix(data, "rangekey: ") + s := keyspan.ParseSpan(data) + err := rangekey.Encode(&s, w.AddRangeKey) + if err != nil { + return err.Error() + } + continue + } + + j := strings.Index(data, ":") + if j < 0 { + return fmt.Sprintf("malformed input: %s\n", data) + } + key := 
base.ParseInternalKey(data[:j]) + value := []byte(data[j+1:]) + if err := w.Add(key, value); err != nil { + return err.Error() + } + } + if err := w.Close(); err != nil { + return err.Error() + } + + opts := (&Options{ + Comparer: DefaultComparer, + FS: mem, + }).WithFSDefaults() + lr, err := ingestLoad(opts, dbVersion, []string{"ext"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0) + if err != nil { + return err.Error() + } + var buf bytes.Buffer + for _, m := range lr.localMeta { + fmt.Fprintf(&buf, "%d: %s-%s\n", m.FileNum, m.Smallest, m.Largest) + fmt.Fprintf(&buf, " points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey) + fmt.Fprintf(&buf, " ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey) + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestIngestLoadRand(t *testing.T) { + mem := vfs.NewMem() + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + cmp := DefaultComparer.Compare + version := internalFormatNewest + + randBytes := func(size int) []byte { + data := make([]byte, size) + for i := range data { + data[i] = byte(rng.Int() & 0xff) + } + return data + } + + paths := make([]string, 1+rng.Intn(10)) + pending := make([]base.DiskFileNum, len(paths)) + expected := make([]*fileMetadata, len(paths)) + for i := range paths { + paths[i] = fmt.Sprint(i) + pending[i] = base.FileNum(rng.Uint64()).DiskFileNum() + expected[i] = &fileMetadata{ + FileNum: pending[i].FileNum(), + } + expected[i].StatsMarkValid() + + func() { + f, err := mem.Create(paths[i]) + require.NoError(t, err) + + keys := make([]InternalKey, 1+rng.Intn(100)) + for i := range keys { + keys[i] = base.MakeInternalKey( + randBytes(1+rng.Intn(10)), + 0, + InternalKeyKindSet) + } + slices.SortFunc(keys, func(a, b base.InternalKey) int { + return base.InternalCompare(cmp, a, b) + }) + + expected[i].ExtendPointKeyBounds(cmp, keys[0], keys[len(keys)-1]) + + w := 
sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: version.MaxTableFormat(), + }) + var count uint64 + for i := range keys { + if i > 0 && base.InternalCompare(cmp, keys[i-1], keys[i]) == 0 { + // Duplicate key, ignore. + continue + } + w.Add(keys[i], nil) + count++ + } + expected[i].Stats.NumEntries = count + require.NoError(t, w.Close()) + + meta, err := w.Metadata() + require.NoError(t, err) + + expected[i].Size = meta.Size + expected[i].InitPhysicalBacking() + }() + } + + opts := (&Options{ + Comparer: DefaultComparer, + FS: mem, + }).WithFSDefaults() + lr, err := ingestLoad(opts, version, paths, nil, nil, 0, pending, nil, 0) + require.NoError(t, err) + + for _, m := range lr.localMeta { + m.CreationTime = 0 + } + t.Log(strings.Join(pretty.Diff(expected, lr.localMeta), "\n")) + require.Equal(t, expected, lr.localMeta) +} + +func TestIngestLoadInvalid(t *testing.T) { + mem := vfs.NewMem() + f, err := mem.Create("invalid") + require.NoError(t, err) + require.NoError(t, f.Close()) + + opts := (&Options{ + Comparer: DefaultComparer, + FS: mem, + }).WithFSDefaults() + if _, err := ingestLoad(opts, internalFormatNewest, []string{"invalid"}, nil, nil, 0, []base.DiskFileNum{base.FileNum(1).DiskFileNum()}, nil, 0); err == nil { + t.Fatalf("expected error, but found success") + } +} + +func TestIngestSortAndVerify(t *testing.T) { + comparers := map[string]Compare{ + "default": DefaultComparer.Compare, + "reverse": func(a, b []byte) int { + return DefaultComparer.Compare(b, a) + }, + } + + t.Run("", func(t *testing.T) { + datadriven.RunTest(t, "testdata/ingest_sort_and_verify", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "ingest": + var buf bytes.Buffer + var meta []*fileMetadata + var paths []string + var cmpName string + d.ScanArgs(t, "cmp", &cmpName) + cmp := comparers[cmpName] + if cmp == nil { + return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, cmpName) + } + for i, data := range 
strings.Split(d.Input, "\n") { + parts := strings.Split(data, "-") + if len(parts) != 2 { + return fmt.Sprintf("malformed test case: %s", d.Input) + } + smallest := base.ParseInternalKey(parts[0]) + largest := base.ParseInternalKey(parts[1]) + if cmp(smallest.UserKey, largest.UserKey) > 0 { + return fmt.Sprintf("range %v-%v is not valid", smallest, largest) + } + m := (&fileMetadata{}).ExtendPointKeyBounds(cmp, smallest, largest) + m.InitPhysicalBacking() + meta = append(meta, m) + paths = append(paths, strconv.Itoa(i)) + } + lr := ingestLoadResult{localPaths: paths, localMeta: meta} + err := ingestSortAndVerify(cmp, lr, KeyRange{}) + if err != nil { + return fmt.Sprintf("%v\n", err) + } + for i := range meta { + fmt.Fprintf(&buf, "%s: %v-%v\n", paths[i], meta[i].Smallest, meta[i].Largest) + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) + }) +} + +func TestIngestLink(t *testing.T) { + // Test linking of tables into the DB directory. Test cleanup when one of the + // tables cannot be linked. + + const dir = "db" + const count = 10 + for i := 0; i <= count; i++ { + t.Run("", func(t *testing.T) { + opts := &Options{FS: vfs.NewMem()} + opts.EnsureDefaults().WithFSDefaults() + require.NoError(t, opts.FS.MkdirAll(dir, 0755)) + objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(opts.FS, dir)) + require.NoError(t, err) + defer objProvider.Close() + + paths := make([]string, 10) + meta := make([]*fileMetadata, len(paths)) + contents := make([][]byte, len(paths)) + for j := range paths { + paths[j] = fmt.Sprintf("external%d", j) + meta[j] = &fileMetadata{} + meta[j].FileNum = FileNum(j) + meta[j].InitPhysicalBacking() + f, err := opts.FS.Create(paths[j]) + require.NoError(t, err) + + contents[j] = []byte(fmt.Sprintf("data%d", j)) + // memFile.Write will modify the supplied buffer when invariants are + // enabled, so provide a throw-away copy. 
+ _, err = f.Write(append([]byte(nil), contents[j]...)) + require.NoError(t, err) + require.NoError(t, f.Close()) + } + + if i < count { + opts.FS.Remove(paths[i]) + } + + lr := ingestLoadResult{localMeta: meta, localPaths: paths} + err = ingestLink(0 /* jobID */, opts, objProvider, lr, nil /* shared */) + if i < count { + if err == nil { + t.Fatalf("expected error, but found success") + } + } else { + require.NoError(t, err) + } + + files, err := opts.FS.List(dir) + require.NoError(t, err) + + sort.Strings(files) + + if i < count { + if len(files) > 0 { + t.Fatalf("expected all of the files to be cleaned up, but found:\n%s", + strings.Join(files, "\n")) + } + } else { + if len(files) != count { + t.Fatalf("expected %d files, but found:\n%s", count, strings.Join(files, "\n")) + } + for j := range files { + ftype, fileNum, ok := base.ParseFilename(opts.FS, files[j]) + if !ok { + t.Fatalf("unable to parse filename: %s", files[j]) + } + if fileTypeTable != ftype { + t.Fatalf("expected table, but found %d", ftype) + } + if j != int(fileNum.FileNum()) { + t.Fatalf("expected table %d, but found %d", j, fileNum) + } + f, err := opts.FS.Open(opts.FS.PathJoin(dir, files[j])) + require.NoError(t, err) + + data, err := io.ReadAll(f) + require.NoError(t, err) + require.NoError(t, f.Close()) + if !bytes.Equal(contents[j], data) { + t.Fatalf("expected %s, but found %s", contents[j], data) + } + } + } + }) + } +} + +func TestIngestLinkFallback(t *testing.T) { + // Verify that ingestLink succeeds if linking fails by falling back to + // copying. + mem := vfs.NewMem() + src, err := mem.Create("source") + require.NoError(t, err) + + opts := &Options{FS: errorfs.Wrap(mem, errorfs.ErrInjected.If(errorfs.OnIndex(1)))} + opts.EnsureDefaults().WithFSDefaults() + objSettings := objstorageprovider.DefaultSettings(opts.FS, "") + // Prevent the provider from listing the dir (where we may get an injected error). 
+ objSettings.FSDirInitialListing = []string{} + objProvider, err := objstorageprovider.Open(objSettings) + require.NoError(t, err) + defer objProvider.Close() + + meta := []*fileMetadata{{FileNum: 1}} + meta[0].InitPhysicalBacking() + lr := ingestLoadResult{localMeta: meta, localPaths: []string{"source"}} + err = ingestLink(0, opts, objProvider, lr, nil /* shared */) + require.NoError(t, err) + + dest, err := mem.Open("000001.sst") + require.NoError(t, err) + + // We should be able to write bytes to src, and not have them show up in + // dest. + _, _ = src.Write([]byte("test")) + data, err := io.ReadAll(dest) + require.NoError(t, err) + if len(data) != 0 { + t.Fatalf("expected copy, but files appear to be hard linked: [%s] unexpectedly found", data) + } +} + +func TestOverlappingIngestedSSTs(t *testing.T) { + dir := "" + var ( + mem vfs.FS + d *DB + opts *Options + closed = false + blockFlush = false + ) + defer func() { + if !closed { + require.NoError(t, d.Close()) + } + }() + + reset := func(strictMem bool) { + if d != nil && !closed { + require.NoError(t, d.Close()) + } + blockFlush = false + + if strictMem { + mem = vfs.NewStrictMem() + } else { + mem = vfs.NewMem() + } + + require.NoError(t, mem.MkdirAll("ext", 0755)) + opts = (&Options{ + FS: mem, + MemTableStopWritesThreshold: 4, + L0CompactionThreshold: 100, + L0StopWritesThreshold: 100, + DebugCheck: DebugCheckLevels, + FormatMajorVersion: internalFormatNewest, + }).WithFSDefaults() + // Disable automatic compactions because otherwise we'll race with + // delete-only compactions triggered by ingesting range tombstones. 
+ opts.DisableAutomaticCompactions = true + + var err error + d, err = Open(dir, opts) + require.NoError(t, err) + d.TestOnlyWaitForCleaning() + } + waitForFlush := func() { + if d == nil { + return + } + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + } + reset(false) + + datadriven.RunTest(t, "testdata/flushable_ingest", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + reset(td.HasArg("strictMem")) + return "" + + case "ignoreSyncs": + var ignoreSyncs bool + if len(td.CmdArgs) == 1 && td.CmdArgs[0].String() == "true" { + ignoreSyncs = true + } + mem.(*vfs.MemFS).SetIgnoreSyncs(ignoreSyncs) + return "" + + case "resetToSynced": + mem.(*vfs.MemFS).ResetToSyncedState() + files, err := mem.List(dir) + sort.Strings(files) + require.NoError(t, err) + return strings.Join(files, "\n") + + case "batch": + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(nil); err != nil { + return err.Error() + } + return "" + + case "build": + if err := runBuildCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + + case "ingest": + if err := runIngestCmd(td, d, mem); err != nil { + return err.Error() + } + if !blockFlush { + waitForFlush() + } + return "" + + case "iter": + iter, _ := d.NewIter(nil) + return runIterCmd(td, iter, true) + + case "lsm": + return runLSMCmd(td, d) + + case "close": + if closed { + return "already closed" + } + require.NoError(t, d.Close()) + closed = true + return "" + + case "ls": + files, err := mem.List(dir) + sort.Strings(files) + require.NoError(t, err) + return strings.Join(files, "\n") + + case "open": + opts.ReadOnly = td.HasArg("readOnly") + var err error + d, err = Open(dir, opts) + closed = false + require.NoError(t, err) + waitForFlush() + d.TestOnlyWaitForCleaning() + return "" + + case "blockFlush": + blockFlush = true + d.mu.Lock() + d.mu.compact.flushing = true + 
d.mu.Unlock() + return "" + + case "allowFlush": + blockFlush = false + d.mu.Lock() + d.mu.compact.flushing = false + d.mu.Unlock() + return "" + + case "flush": + d.maybeScheduleFlush() + waitForFlush() + d.TestOnlyWaitForCleaning() + return "" + + case "get": + return runGetCmd(t, td, d) + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestExcise(t *testing.T) { + var mem vfs.FS + var d *DB + var flushed bool + defer func() { + require.NoError(t, d.Close()) + }() + + var opts *Options + reset := func() { + if d != nil { + require.NoError(t, d.Close()) + } + + mem = vfs.NewMem() + require.NoError(t, mem.MkdirAll("ext", 0755)) + opts = &Options{ + FS: mem, + L0CompactionThreshold: 100, + L0StopWritesThreshold: 100, + DebugCheck: DebugCheckLevels, + EventListener: &EventListener{FlushEnd: func(info FlushInfo) { + flushed = true + }}, + FormatMajorVersion: FormatVirtualSSTables, + Comparer: testkeys.Comparer, + } + // Disable automatic compactions because otherwise we'll race with + // delete-only compactions triggered by ingesting range tombstones. + opts.DisableAutomaticCompactions = true + // Set this to true to add some testing for the virtual sstable validation + // code paths. 
+ opts.Experimental.ValidateOnIngest = true + + var err error + d, err = Open("", opts) + require.NoError(t, err) + } + reset() + + datadriven.RunTest(t, "testdata/excise", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + reset() + return "" + case "reopen": + require.NoError(t, d.Close()) + var err error + d, err = Open("", opts) + require.NoError(t, err) + + return "" + case "batch": + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(nil); err != nil { + return err.Error() + } + return "" + case "build": + if err := runBuildCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + + case "flush": + if err := d.Flush(); err != nil { + return err.Error() + } + return "" + + case "ingest": + flushed = false + if err := runIngestCmd(td, d, mem); err != nil { + return err.Error() + } + // Wait for a possible flush. + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + if flushed { + return "memtable flushed" + } + return "" + + case "ingest-and-excise": + flushed = false + if err := runIngestAndExciseCmd(td, d, mem); err != nil { + return err.Error() + } + // Wait for a possible flush. + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + if flushed { + return "memtable flushed" + } + return "" + + case "get": + return runGetCmd(t, td, d) + + case "iter": + iter, _ := d.NewIter(&IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + }) + return runIterCmd(td, iter, true) + + case "lsm": + return runLSMCmd(td, d) + + case "metrics": + // The asynchronous loading of table stats can change metrics, so + // wait for all the tables' stats to be loaded. 
+ d.mu.Lock() + d.waitTableStats() + d.mu.Unlock() + + return d.Metrics().StringForTests() + + case "wait-pending-table-stats": + return runTableStatsCmd(td, d) + + case "excise": + ve := &versionEdit{ + DeletedFiles: map[deletedFileEntry]*fileMetadata{}, + } + var exciseSpan KeyRange + if len(td.CmdArgs) != 2 { + panic("insufficient args for compact command") + } + exciseSpan.Start = []byte(td.CmdArgs[0].Key) + exciseSpan.End = []byte(td.CmdArgs[1].Key) + + d.mu.Lock() + d.mu.versions.logLock() + d.mu.Unlock() + current := d.mu.versions.currentVersion() + for level := range current.Levels { + iter := current.Levels[level].Iter() + for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() { + _, err := d.excise(exciseSpan, m, ve, level) + if err != nil { + d.mu.Lock() + d.mu.versions.logUnlock() + d.mu.Unlock() + return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error()) + } + } + } + d.mu.Lock() + d.mu.versions.logUnlock() + d.mu.Unlock() + return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.DebugString(base.DefaultFormatter)) + + case "confirm-backing": + // Confirms that the files have the same FileBacking. 
+ fileNums := make(map[base.FileNum]struct{}) + for i := range td.CmdArgs { + fNum, err := strconv.Atoi(td.CmdArgs[i].Key) + if err != nil { + panic("invalid file number") + } + fileNums[base.FileNum(fNum)] = struct{}{} + } + d.mu.Lock() + currVersion := d.mu.versions.currentVersion() + var ptr *manifest.FileBacking + for _, level := range currVersion.Levels { + lIter := level.Iter() + for f := lIter.First(); f != nil; f = lIter.Next() { + if _, ok := fileNums[f.FileNum]; ok { + if ptr == nil { + ptr = f.FileBacking + continue + } + if f.FileBacking != ptr { + d.mu.Unlock() + return "file backings are not the same" + } + } + } + } + d.mu.Unlock() + return "file backings are the same" + case "compact": + if len(td.CmdArgs) != 2 { + panic("insufficient args for compact command") + } + l := td.CmdArgs[0].Key + r := td.CmdArgs[1].Key + err := d.Compact([]byte(l), []byte(r), false) + if err != nil { + return err.Error() + } + return "" + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func testIngestSharedImpl( + t *testing.T, createOnShared remote.CreateOnSharedStrategy, fileName string, +) { + var d, d1, d2 *DB + var efos map[string]*EventuallyFileOnlySnapshot + defer func() { + for _, e := range efos { + require.NoError(t, e.Close()) + } + if d1 != nil { + require.NoError(t, d1.Close()) + } + if d2 != nil { + require.NoError(t, d2.Close()) + } + }() + creatorIDCounter := uint64(1) + replicateCounter := 1 + var opts1, opts2 *Options + + reset := func() { + for _, e := range efos { + require.NoError(t, e.Close()) + } + if d1 != nil { + require.NoError(t, d1.Close()) + } + if d2 != nil { + require.NoError(t, d2.Close()) + } + efos = make(map[string]*EventuallyFileOnlySnapshot) + + sstorage := remote.NewInMem() + mem1 := vfs.NewMem() + mem2 := vfs.NewMem() + require.NoError(t, mem1.MkdirAll("ext", 0755)) + require.NoError(t, mem2.MkdirAll("ext", 0755)) + opts1 = &Options{ + Comparer: testkeys.Comparer, + FS: mem1, + LBaseMaxBytes: 1, + 
L0CompactionThreshold: 100, + L0StopWritesThreshold: 100, + DebugCheck: DebugCheckLevels, + FormatMajorVersion: FormatVirtualSSTables, + } + // lel. + lel := MakeLoggingEventListener(DefaultLogger) + opts1.EventListener = &lel + opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ + "": sstorage, + }) + opts1.Experimental.CreateOnShared = createOnShared + opts1.Experimental.CreateOnSharedLocator = "" + // Disable automatic compactions because otherwise we'll race with + // delete-only compactions triggered by ingesting range tombstones. + opts1.DisableAutomaticCompactions = true + + opts2 = &Options{} + *opts2 = *opts1 + opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ + "": sstorage, + }) + opts2.Experimental.CreateOnShared = createOnShared + opts2.Experimental.CreateOnSharedLocator = "" + opts2.FS = mem2 + + var err error + d1, err = Open("", opts1) + require.NoError(t, err) + require.NoError(t, d1.SetCreatorID(creatorIDCounter)) + creatorIDCounter++ + d2, err = Open("", opts2) + require.NoError(t, err) + require.NoError(t, d2.SetCreatorID(creatorIDCounter)) + creatorIDCounter++ + d = d1 + } + reset() + + datadriven.RunTest(t, fmt.Sprintf("testdata/%s", fileName), func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "restart": + for _, e := range efos { + require.NoError(t, e.Close()) + } + if d1 != nil { + require.NoError(t, d1.Close()) + } + if d2 != nil { + require.NoError(t, d2.Close()) + } + + var err error + d1, err = Open("", opts1) + if err != nil { + return err.Error() + } + d2, err = Open("", opts2) + if err != nil { + return err.Error() + } + d = d1 + return "ok, note that the active db has been set to 1 (use 'switch' to change)" + case "reset": + reset() + return "" + case "switch": + if len(td.CmdArgs) != 1 { + return "usage: switch <1 or 2>" + } + switch td.CmdArgs[0].Key { + case "1": + d = d1 + case "2": + d = d2 + default: + return 
"usage: switch <1 or 2>" + } + return "ok" + case "batch": + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(nil); err != nil { + return err.Error() + } + return "" + case "build": + if err := runBuildCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" + + case "flush": + if err := d.Flush(); err != nil { + return err.Error() + } + return "" + + case "ingest": + if err := runIngestCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + // Wait for a possible flush. + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + return "" + + case "ingest-and-excise": + if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + // Wait for a possible flush. + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + return "" + + case "replicate": + if len(td.CmdArgs) != 4 { + return "usage: replicate " + } + var from, to *DB + switch td.CmdArgs[0].Key { + case "1": + from = d1 + case "2": + from = d2 + default: + return "usage: replicate " + } + switch td.CmdArgs[1].Key { + case "1": + to = d1 + case "2": + to = d2 + default: + return "usage: replicate " + } + startKey := []byte(td.CmdArgs[2].Key) + endKey := []byte(td.CmdArgs[3].Key) + + writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat()) + sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter) + f, err := to.opts.FS.Create(sstPath) + require.NoError(t, err) + replicateCounter++ + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts) + + var sharedSSTs []SharedSSTMeta + err = from.ScanInternal(context.TODO(), sstable.CategoryAndQoS{}, startKey, endKey, + func(key *InternalKey, value LazyValue, _ IteratorLevel) error { + val, _, err := value.Value(nil) + require.NoError(t, err) + require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, 
key.Kind()), val)) + return nil + }, + func(start, end []byte, seqNum uint64) error { + require.NoError(t, w.DeleteRange(start, end)) + return nil + }, + func(start, end []byte, keys []keyspan.Key) error { + s := keyspan.Span{ + Start: start, + End: end, + Keys: keys, + KeysOrder: 0, + } + require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error { + return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v) + })) + return nil + }, + func(sst *SharedSSTMeta) error { + sharedSSTs = append(sharedSSTs, *sst) + return nil + }, + ) + require.NoError(t, err) + require.NoError(t, w.Close()) + + _, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey}) + require.NoError(t, err) + return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs)) + + case "get": + return runGetCmd(t, td, d) + + case "iter": + o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges} + var reader Reader + reader = d + for _, arg := range td.CmdArgs { + switch arg.Key { + case "mask-suffix": + o.RangeKeyMasking.Suffix = []byte(arg.Vals[0]) + case "mask-filter": + o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask { + return sstable.NewTestKeysMaskingFilter() + } + case "snapshot": + reader = efos[arg.Vals[0]] + } + } + iter, err := reader.NewIter(o) + if err != nil { + return err.Error() + } + return runIterCmd(td, iter, true) + + case "lsm": + return runLSMCmd(td, d) + + case "metrics": + // The asynchronous loading of table stats can change metrics, so + // wait for all the tables' stats to be loaded. 
+ d.mu.Lock() + d.waitTableStats() + d.mu.Unlock() + + return d.Metrics().StringForTests() + + case "wait-pending-table-stats": + return runTableStatsCmd(td, d) + + case "excise": + ve := &versionEdit{ + DeletedFiles: map[deletedFileEntry]*fileMetadata{}, + } + var exciseSpan KeyRange + if len(td.CmdArgs) != 2 { + panic("insufficient args for excise command") + } + exciseSpan.Start = []byte(td.CmdArgs[0].Key) + exciseSpan.End = []byte(td.CmdArgs[1].Key) + + d.mu.Lock() + d.mu.versions.logLock() + d.mu.Unlock() + current := d.mu.versions.currentVersion() + for level := range current.Levels { + iter := current.Levels[level].Iter() + for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() { + _, err := d.excise(exciseSpan, m, ve, level) + if err != nil { + d.mu.Lock() + d.mu.versions.logUnlock() + d.mu.Unlock() + return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error()) + } + } + } + d.mu.Lock() + d.mu.versions.logUnlock() + d.mu.Unlock() + return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String()) + + case "file-only-snapshot": + if len(td.CmdArgs) != 1 { + panic("insufficient args for file-only-snapshot command") + } + name := td.CmdArgs[0].Key + var keyRanges []KeyRange + for _, line := range strings.Split(td.Input, "\n") { + fields := strings.Fields(line) + if len(fields) != 2 { + return "expected two fields for file-only snapshot KeyRanges" + } + kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])} + keyRanges = append(keyRanges, kr) + } + + s := d.NewEventuallyFileOnlySnapshot(keyRanges) + efos[name] = s + return "ok" + + case "wait-for-file-only-snapshot": + if len(td.CmdArgs) != 1 { + panic("insufficient args for file-only-snapshot command") + } + name := td.CmdArgs[0].Key + err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond) + if err != nil { + return err.Error() + } + return "ok" + + 
case "compact": + err := runCompactCmd(td, d) + if err != nil { + return err.Error() + } + return "ok" + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestIngestShared(t *testing.T) { + for _, strategy := range []remote.CreateOnSharedStrategy{remote.CreateOnSharedAll, remote.CreateOnSharedLower} { + strategyStr := "all" + if strategy == remote.CreateOnSharedLower { + strategyStr = "lower" + } + t.Run(fmt.Sprintf("createOnShared=%s", strategyStr), func(t *testing.T) { + fileName := "ingest_shared" + if strategy == remote.CreateOnSharedLower { + fileName = "ingest_shared_lower" + } + testIngestSharedImpl(t, strategy, fileName) + }) + } +} + +func TestSimpleIngestShared(t *testing.T) { + mem := vfs.NewMem() + var d *DB + var provider2 objstorage.Provider + opts2 := Options{FS: vfs.NewMem(), FormatMajorVersion: FormatVirtualSSTables} + opts2.EnsureDefaults() + + // Create an objProvider where we will fake-create some sstables that can + // then be shared back to the db instance. 
+ providerSettings := objstorageprovider.Settings{ + Logger: opts2.Logger, + FS: opts2.FS, + FSDirName: "", + FSDirInitialListing: nil, + FSCleaner: opts2.Cleaner, + NoSyncOnClose: opts2.NoSyncOnClose, + BytesPerSync: opts2.BytesPerSync, + } + providerSettings.Remote.StorageFactory = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ + "": remote.NewInMem(), + }) + providerSettings.Remote.CreateOnShared = remote.CreateOnSharedAll + providerSettings.Remote.CreateOnSharedLocator = "" + + provider2, err := objstorageprovider.Open(providerSettings) + require.NoError(t, err) + creatorIDCounter := uint64(1) + provider2.SetCreatorID(objstorage.CreatorID(creatorIDCounter)) + creatorIDCounter++ + + defer func() { + require.NoError(t, d.Close()) + }() + + reset := func() { + if d != nil { + require.NoError(t, d.Close()) + } + + mem = vfs.NewMem() + require.NoError(t, mem.MkdirAll("ext", 0755)) + opts := &Options{ + FormatMajorVersion: FormatVirtualSSTables, + FS: mem, + L0CompactionThreshold: 100, + L0StopWritesThreshold: 100, + } + opts.Experimental.RemoteStorage = providerSettings.Remote.StorageFactory + opts.Experimental.CreateOnShared = providerSettings.Remote.CreateOnShared + opts.Experimental.CreateOnSharedLocator = providerSettings.Remote.CreateOnSharedLocator + + var err error + d, err = Open("", opts) + require.NoError(t, err) + require.NoError(t, d.SetCreatorID(creatorIDCounter)) + creatorIDCounter++ + } + reset() + + metaMap := map[base.DiskFileNum]objstorage.ObjectMetadata{} + + require.NoError(t, d.Set([]byte("d"), []byte("unexpected"), nil)) + require.NoError(t, d.Set([]byte("e"), []byte("unexpected"), nil)) + require.NoError(t, d.Set([]byte("a"), []byte("unexpected"), nil)) + require.NoError(t, d.Set([]byte("f"), []byte("unexpected"), nil)) + d.Flush() + + { + // Create a shared file. 
+ fn := base.FileNum(2) + f, meta, err := provider2.Create(context.TODO(), fileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{PreferSharedStorage: true}) + require.NoError(t, err) + w := sstable.NewWriter(f, d.opts.MakeWriterOptions(0, d.opts.FormatMajorVersion.MaxTableFormat())) + w.Set([]byte("d"), []byte("shared")) + w.Set([]byte("e"), []byte("shared")) + w.Close() + metaMap[fn.DiskFileNum()] = meta + } + + m := metaMap[base.FileNum(2).DiskFileNum()] + handle, err := provider2.RemoteObjectBacking(&m) + require.NoError(t, err) + size, err := provider2.Size(m) + require.NoError(t, err) + + sharedSSTMeta := SharedSSTMeta{ + Backing: handle, + Smallest: base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet), + Largest: base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet), + SmallestPointKey: base.MakeInternalKey([]byte("d"), 0, InternalKeyKindSet), + LargestPointKey: base.MakeInternalKey([]byte("e"), 0, InternalKeyKindSet), + Level: 6, + Size: uint64(size + 5), + } + _, err = d.IngestAndExcise([]string{}, []SharedSSTMeta{sharedSSTMeta}, KeyRange{Start: []byte("d"), End: []byte("ee")}) + require.NoError(t, err) + + // TODO(bilal): Once reading of shared sstables is in, verify that the values + // of d and e have been updated. +} + +type blockedCompaction struct { + startBlock, unblock chan struct{} +} + +func TestConcurrentExcise(t *testing.T) { + var d, d1, d2 *DB + var efos map[string]*EventuallyFileOnlySnapshot + backgroundErrs := make(chan error, 5) + var compactions map[string]*blockedCompaction + defer func() { + for _, e := range efos { + require.NoError(t, e.Close()) + } + if d1 != nil { + require.NoError(t, d1.Close()) + } + if d2 != nil { + require.NoError(t, d2.Close()) + } + }() + creatorIDCounter := uint64(1) + replicateCounter := 1 + + var wg sync.WaitGroup + defer wg.Wait() + var blockNextCompaction bool + var blockedJobID int + var blockedCompactionName string + var blockedCompactionsMu sync.Mutex // protects the above three variables. 
+ + reset := func() { + wg.Wait() + for _, e := range efos { + require.NoError(t, e.Close()) + } + if d1 != nil { + require.NoError(t, d1.Close()) + } + if d2 != nil { + require.NoError(t, d2.Close()) + } + efos = make(map[string]*EventuallyFileOnlySnapshot) + compactions = make(map[string]*blockedCompaction) + backgroundErrs = make(chan error, 5) + + var el EventListener + el.EnsureDefaults(testLogger{t: t}) + el.FlushBegin = func(info FlushInfo) { + // Don't block flushes + } + el.BackgroundError = func(err error) { + backgroundErrs <- err + } + el.CompactionBegin = func(info CompactionInfo) { + if info.Reason == "move" { + return + } + blockedCompactionsMu.Lock() + defer blockedCompactionsMu.Unlock() + if blockNextCompaction { + blockNextCompaction = false + blockedJobID = info.JobID + } + } + el.TableCreated = func(info TableCreateInfo) { + blockedCompactionsMu.Lock() + if info.JobID != blockedJobID { + blockedCompactionsMu.Unlock() + return + } + blockedJobID = 0 + c := compactions[blockedCompactionName] + blockedCompactionName = "" + blockedCompactionsMu.Unlock() + c.startBlock <- struct{}{} + <-c.unblock + } + + sstorage := remote.NewInMem() + mem1 := vfs.NewMem() + mem2 := vfs.NewMem() + require.NoError(t, mem1.MkdirAll("ext", 0755)) + require.NoError(t, mem2.MkdirAll("ext", 0755)) + opts1 := &Options{ + Comparer: testkeys.Comparer, + LBaseMaxBytes: 1, + FS: mem1, + L0CompactionThreshold: 100, + L0StopWritesThreshold: 100, + DebugCheck: DebugCheckLevels, + FormatMajorVersion: FormatVirtualSSTables, + } + // lel. 
+ lel := MakeLoggingEventListener(DefaultLogger) + tel := TeeEventListener(lel, el) + opts1.EventListener = &tel + opts1.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ + "": sstorage, + }) + opts1.Experimental.CreateOnShared = remote.CreateOnSharedAll + opts1.Experimental.CreateOnSharedLocator = "" + // Disable automatic compactions because otherwise we'll race with + // delete-only compactions triggered by ingesting range tombstones. + opts1.DisableAutomaticCompactions = true + + opts2 := &Options{} + *opts2 = *opts1 + opts2.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ + "": sstorage, + }) + opts2.Experimental.CreateOnShared = remote.CreateOnSharedAll + opts2.Experimental.CreateOnSharedLocator = "" + opts2.FS = mem2 + + var err error + d1, err = Open("", opts1) + require.NoError(t, err) + require.NoError(t, d1.SetCreatorID(creatorIDCounter)) + creatorIDCounter++ + d2, err = Open("", opts2) + require.NoError(t, err) + require.NoError(t, d2.SetCreatorID(creatorIDCounter)) + creatorIDCounter++ + d = d1 + } + reset() + + datadriven.RunTest(t, "testdata/concurrent_excise", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + reset() + return "" + case "switch": + if len(td.CmdArgs) != 1 { + return "usage: switch <1 or 2>" + } + switch td.CmdArgs[0].Key { + case "1": + d = d1 + case "2": + d = d2 + default: + return "usage: switch <1 or 2>" + } + return "ok" + case "batch": + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(nil); err != nil { + return err.Error() + } + return "" + case "build": + if err := runBuildCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" + + case "flush": + if err := d.Flush(); err != nil { + return err.Error() + } + return "" + + case "ingest": + if err := runIngestCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + // 
Wait for a possible flush. + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + return "" + + case "ingest-and-excise": + if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + // Wait for a possible flush. + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + return "" + + case "replicate": + if len(td.CmdArgs) != 4 { + return "usage: replicate " + } + var from, to *DB + switch td.CmdArgs[0].Key { + case "1": + from = d1 + case "2": + from = d2 + default: + return "usage: replicate " + } + switch td.CmdArgs[1].Key { + case "1": + to = d1 + case "2": + to = d2 + default: + return "usage: replicate " + } + startKey := []byte(td.CmdArgs[2].Key) + endKey := []byte(td.CmdArgs[3].Key) + + writeOpts := d.opts.MakeWriterOptions(0 /* level */, to.opts.FormatMajorVersion.MaxTableFormat()) + sstPath := fmt.Sprintf("ext/replicate%d.sst", replicateCounter) + f, err := to.opts.FS.Create(sstPath) + require.NoError(t, err) + replicateCounter++ + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), writeOpts) + + var sharedSSTs []SharedSSTMeta + err = from.ScanInternal(context.TODO(), sstable.CategoryAndQoS{}, startKey, endKey, + func(key *InternalKey, value LazyValue, _ IteratorLevel) error { + val, _, err := value.Value(nil) + require.NoError(t, err) + require.NoError(t, w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val)) + return nil + }, + func(start, end []byte, seqNum uint64) error { + require.NoError(t, w.DeleteRange(start, end)) + return nil + }, + func(start, end []byte, keys []keyspan.Key) error { + s := keyspan.Span{ + Start: start, + End: end, + Keys: keys, + KeysOrder: 0, + } + require.NoError(t, rangekey.Encode(&s, func(k base.InternalKey, v []byte) error { + return w.AddRangeKey(base.MakeInternalKey(k.UserKey, 0, k.Kind()), v) + })) + return nil + }, + func(sst *SharedSSTMeta) error { + sharedSSTs = append(sharedSSTs, *sst) + return 
nil + }, + ) + require.NoError(t, err) + require.NoError(t, w.Close()) + + _, err = to.IngestAndExcise([]string{sstPath}, sharedSSTs, KeyRange{Start: startKey, End: endKey}) + require.NoError(t, err) + return fmt.Sprintf("replicated %d shared SSTs", len(sharedSSTs)) + + case "get": + return runGetCmd(t, td, d) + + case "iter": + o := &IterOptions{KeyTypes: IterKeyTypePointsAndRanges} + var reader Reader + reader = d + for _, arg := range td.CmdArgs { + switch arg.Key { + case "mask-suffix": + o.RangeKeyMasking.Suffix = []byte(arg.Vals[0]) + case "mask-filter": + o.RangeKeyMasking.Filter = func() BlockPropertyFilterMask { + return sstable.NewTestKeysMaskingFilter() + } + case "snapshot": + reader = efos[arg.Vals[0]] + } + } + iter, err := reader.NewIter(o) + if err != nil { + return err.Error() + } + return runIterCmd(td, iter, true) + + case "lsm": + return runLSMCmd(td, d) + + case "metrics": + // The asynchronous loading of table stats can change metrics, so + // wait for all the tables' stats to be loaded. 
+ d.mu.Lock() + d.waitTableStats() + d.mu.Unlock() + + return d.Metrics().StringForTests() + + case "wait-pending-table-stats": + return runTableStatsCmd(td, d) + + case "excise": + ve := &versionEdit{ + DeletedFiles: map[deletedFileEntry]*fileMetadata{}, + } + var exciseSpan KeyRange + if len(td.CmdArgs) != 2 { + panic("insufficient args for excise command") + } + exciseSpan.Start = []byte(td.CmdArgs[0].Key) + exciseSpan.End = []byte(td.CmdArgs[1].Key) + + d.mu.Lock() + d.mu.versions.logLock() + d.mu.Unlock() + current := d.mu.versions.currentVersion() + for level := range current.Levels { + iter := current.Levels[level].Iter() + for m := iter.SeekGE(d.cmp, exciseSpan.Start); m != nil && d.cmp(m.Smallest.UserKey, exciseSpan.End) < 0; m = iter.Next() { + _, err := d.excise(exciseSpan, m, ve, level) + if err != nil { + d.mu.Lock() + d.mu.versions.logUnlock() + d.mu.Unlock() + return fmt.Sprintf("error when excising %s: %s", m.FileNum, err.Error()) + } + } + } + d.mu.Lock() + d.mu.versions.logUnlock() + d.mu.Unlock() + return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedFiles), ve.String()) + + case "file-only-snapshot": + if len(td.CmdArgs) != 1 { + panic("insufficient args for file-only-snapshot command") + } + name := td.CmdArgs[0].Key + var keyRanges []KeyRange + for _, line := range strings.Split(td.Input, "\n") { + fields := strings.Fields(line) + if len(fields) != 2 { + return "expected two fields for file-only snapshot KeyRanges" + } + kr := KeyRange{Start: []byte(fields[0]), End: []byte(fields[1])} + keyRanges = append(keyRanges, kr) + } + + s := d.NewEventuallyFileOnlySnapshot(keyRanges) + efos[name] = s + return "ok" + + case "wait-for-file-only-snapshot": + if len(td.CmdArgs) != 1 { + panic("insufficient args for file-only-snapshot command") + } + name := td.CmdArgs[0].Key + err := efos[name].WaitForFileOnlySnapshot(context.TODO(), 1*time.Millisecond) + if err != nil { + return err.Error() + } + return "ok" + + 
case "unblock": + name := td.CmdArgs[0].Key + blockedCompactionsMu.Lock() + c := compactions[name] + delete(compactions, name) + blockedCompactionsMu.Unlock() + c.unblock <- struct{}{} + return "ok" + + case "compact": + async := false + var otherArgs []datadriven.CmdArg + var bc *blockedCompaction + for i := range td.CmdArgs { + switch td.CmdArgs[i].Key { + case "block": + name := td.CmdArgs[i].Vals[0] + bc = &blockedCompaction{startBlock: make(chan struct{}), unblock: make(chan struct{})} + blockedCompactionsMu.Lock() + compactions[name] = bc + blockNextCompaction = true + blockedCompactionName = name + blockedCompactionsMu.Unlock() + async = true + default: + otherArgs = append(otherArgs, td.CmdArgs[i]) + } + } + var tdClone datadriven.TestData + tdClone = *td + tdClone.CmdArgs = otherArgs + if !async { + err := runCompactCmd(td, d) + if err != nil { + return err.Error() + } + } else { + wg.Add(1) + go func() { + defer wg.Done() + _ = runCompactCmd(&tdClone, d) + }() + <-bc.startBlock + return "spun off in separate goroutine" + } + return "ok" + case "wait-for-background-error": + err := <-backgroundErrs + return err.Error() + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestIngestExternal(t *testing.T) { + var mem vfs.FS + var d *DB + var flushed bool + defer func() { + require.NoError(t, d.Close()) + }() + + var remoteStorage remote.Storage + + reset := func() { + if d != nil { + require.NoError(t, d.Close()) + } + + mem = vfs.NewMem() + require.NoError(t, mem.MkdirAll("ext", 0755)) + remoteStorage = remote.NewInMem() + opts := &Options{ + FS: mem, + L0CompactionThreshold: 100, + L0StopWritesThreshold: 100, + DebugCheck: DebugCheckLevels, + EventListener: &EventListener{FlushEnd: func(info FlushInfo) { + flushed = true + }}, + FormatMajorVersion: FormatVirtualSSTables, + } + opts.Experimental.RemoteStorage = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{ + "external-locator": remoteStorage, + }) + 
opts.Experimental.CreateOnShared = remote.CreateOnSharedNone + // Disable automatic compactions because otherwise we'll race with + // delete-only compactions triggered by ingesting range tombstones. + opts.DisableAutomaticCompactions = true + + var err error + d, err = Open("", opts) + require.NoError(t, err) + require.NoError(t, d.SetCreatorID(1)) + } + reset() + + datadriven.RunTest(t, "testdata/ingest_external", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + reset() + return "" + case "batch": + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(nil); err != nil { + return err.Error() + } + return "" + case "build-remote": + if err := runBuildRemoteCmd(td, d, remoteStorage); err != nil { + return err.Error() + } + return "" + + case "flush": + if err := d.Flush(); err != nil { + return err.Error() + } + return "" + + case "ingest-external": + flushed = false + if err := runIngestExternalCmd(td, d, "external-locator"); err != nil { + return err.Error() + } + // Wait for a possible flush. + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + if flushed { + return "memtable flushed" + } + return "" + + case "get": + return runGetCmd(t, td, d) + + case "iter": + iter, _ := d.NewIter(&IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + }) + return runIterCmd(td, iter, true) + + case "lsm": + return runLSMCmd(td, d) + + case "metrics": + // The asynchronous loading of table stats can change metrics, so + // wait for all the tables' stats to be loaded. 
+ d.mu.Lock() + d.waitTableStats() + d.mu.Unlock() + + return d.Metrics().StringForTests() + + case "wait-pending-table-stats": + return runTableStatsCmd(td, d) + + case "compact": + if len(td.CmdArgs) != 2 { + panic("insufficient args for compact command") + } + l := td.CmdArgs[0].Key + r := td.CmdArgs[1].Key + err := d.Compact([]byte(l), []byte(r), false) + if err != nil { + return err.Error() + } + return "" + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestIngestMemtableOverlaps(t *testing.T) { + comparers := []Comparer{ + {Name: "default", Compare: DefaultComparer.Compare, FormatKey: DefaultComparer.FormatKey}, + { + Name: "reverse", + Compare: func(a, b []byte) int { return DefaultComparer.Compare(b, a) }, + FormatKey: DefaultComparer.FormatKey, + }, + } + m := make(map[string]*Comparer) + for i := range comparers { + c := &comparers[i] + m[c.Name] = c + } + + for _, comparer := range comparers { + t.Run(comparer.Name, func(t *testing.T) { + var mem *memTable + + parseMeta := func(s string) *fileMetadata { + parts := strings.Split(s, "-") + meta := &fileMetadata{} + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + var smallest, largest base.InternalKey + if strings.Contains(parts[0], ".") { + if !strings.Contains(parts[1], ".") { + t.Fatalf("malformed table spec: %s", s) + } + smallest = base.ParseInternalKey(parts[0]) + largest = base.ParseInternalKey(parts[1]) + } else { + smallest = InternalKey{UserKey: []byte(parts[0])} + largest = InternalKey{UserKey: []byte(parts[1])} + } + // If we're using a reverse comparer, flip the file bounds. 
+ if mem.cmp(smallest.UserKey, largest.UserKey) > 0 { + smallest, largest = largest, smallest + } + meta.ExtendPointKeyBounds(comparer.Compare, smallest, largest) + meta.InitPhysicalBacking() + return meta + } + + datadriven.RunTest(t, "testdata/ingest_memtable_overlaps", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + b := newBatch(nil) + if err := runBatchDefineCmd(d, b); err != nil { + return err.Error() + } + + opts := &Options{ + Comparer: &comparer, + } + opts.EnsureDefaults().WithFSDefaults() + if len(d.CmdArgs) > 1 { + return fmt.Sprintf("%s expects at most 1 argument", d.Cmd) + } + if len(d.CmdArgs) == 1 { + opts.Comparer = m[d.CmdArgs[0].String()] + if opts.Comparer == nil { + return fmt.Sprintf("%s unknown comparer: %s", d.Cmd, d.CmdArgs[0].String()) + } + } + + mem = newMemTable(memTableOptions{Options: opts}) + if err := mem.apply(b, 0); err != nil { + return err.Error() + } + return "" + + case "overlaps": + var buf bytes.Buffer + for _, data := range strings.Split(d.Input, "\n") { + var keyRanges []internalKeyRange + for _, part := range strings.Fields(data) { + meta := parseMeta(part) + keyRanges = append(keyRanges, internalKeyRange{smallest: meta.Smallest, largest: meta.Largest}) + } + fmt.Fprintf(&buf, "%t\n", ingestMemtableOverlaps(mem.cmp, mem, keyRanges)) + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) + }) + } +} + +func TestKeyRangeBasic(t *testing.T) { + cmp := base.DefaultComparer.Compare + k1 := KeyRange{Start: []byte("b"), End: []byte("c")} + + // Tests for Contains() + require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet))) + require.False(t, k1.Contains(cmp, base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet))) + require.True(t, k1.Contains(cmp, base.MakeInternalKey([]byte("bb"), 1, InternalKeyKindSet))) + require.True(t, k1.Contains(cmp, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, 
[]byte("c")))) + + m1 := &fileMetadata{ + Smallest: base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet), + Largest: base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet), + } + require.True(t, k1.Overlaps(cmp, m1)) + m2 := &fileMetadata{ + Smallest: base.MakeInternalKey([]byte("c"), 1, InternalKeyKindSet), + Largest: base.MakeInternalKey([]byte("d"), 1, InternalKeyKindSet), + } + require.False(t, k1.Overlaps(cmp, m2)) + m3 := &fileMetadata{ + Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet), + Largest: base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, []byte("b")), + } + require.False(t, k1.Overlaps(cmp, m3)) + m4 := &fileMetadata{ + Smallest: base.MakeInternalKey([]byte("a"), 1, InternalKeyKindSet), + Largest: base.MakeInternalKey([]byte("b"), 1, InternalKeyKindSet), + } + require.True(t, k1.Overlaps(cmp, m4)) +} + +func BenchmarkIngestOverlappingMemtable(b *testing.B) { + assertNoError := func(err error) { + b.Helper() + if err != nil { + b.Fatal(err) + } + } + + for count := 1; count < 6; count++ { + b.Run(fmt.Sprintf("memtables=%d", count), func(b *testing.B) { + for i := 0; i < b.N; i++ { + b.StopTimer() + mem := vfs.NewMem() + d, err := Open("", &Options{ + FS: mem, + }) + assertNoError(err) + + // Create memtables. + for { + assertNoError(d.Set([]byte("a"), nil, nil)) + d.mu.Lock() + done := len(d.mu.mem.queue) == count + d.mu.Unlock() + if done { + break + } + } + + // Create the overlapping sstable that will force a flush when ingested. + f, err := mem.Create("ext") + assertNoError(err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + assertNoError(w.Set([]byte("a"), nil)) + assertNoError(w.Close()) + + b.StartTimer() + assertNoError(d.Ingest([]string{"ext"})) + } + }) + } +} + +func TestIngestTargetLevel(t *testing.T) { + var d *DB + defer func() { + if d != nil { + // Ignore errors because this test defines fake in-progress transactions + // that prohibit clean shutdown. 
+ _ = d.Close() + } + }() + + parseMeta := func(s string) *fileMetadata { + var rkey bool + if len(s) >= 4 && s[0:4] == "rkey" { + rkey = true + s = s[5:] + } + parts := strings.Split(s, "-") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + var m *fileMetadata + if rkey { + m = (&fileMetadata{}).ExtendRangeKeyBounds( + d.cmp, + InternalKey{UserKey: []byte(parts[0])}, + InternalKey{UserKey: []byte(parts[1])}, + ) + } else { + m = (&fileMetadata{}).ExtendPointKeyBounds( + d.cmp, + InternalKey{UserKey: []byte(parts[0])}, + InternalKey{UserKey: []byte(parts[1])}, + ) + } + m.InitPhysicalBacking() + return m + } + + datadriven.RunTest(t, "testdata/ingest_target_level", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + if d != nil { + // Ignore errors because this test defines fake in-progress + // transactions that prohibit clean shutdown. + _ = d.Close() + } + + var err error + opts := Options{ + FormatMajorVersion: internalFormatNewest, + } + opts.WithFSDefaults() + if d, err = runDBDefineCmd(td, &opts); err != nil { + return err.Error() + } + + readState := d.loadReadState() + c := &checkConfig{ + logger: d.opts.Logger, + comparer: d.opts.Comparer, + readState: readState, + newIters: d.newIters, + // TODO: runDBDefineCmd doesn't properly update the visible + // sequence number. So we have to explicitly configure level checker with a very large + // sequence number, otherwise the DB appears empty. 
+ seqNum: InternalKeySeqNumMax, + } + if err := checkLevelsInternal(c); err != nil { + return err.Error() + } + readState.unref() + + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + + case "target": + var buf bytes.Buffer + suggestSplit := false + for _, cmd := range td.CmdArgs { + switch cmd.Key { + case "suggest-split": + suggestSplit = true + } + } + for _, target := range strings.Split(td.Input, "\n") { + meta := parseMeta(target) + level, overlapFile, err := ingestTargetLevel( + d.newIters, d.tableNewRangeKeyIter, IterOptions{logger: d.opts.Logger}, + d.opts.Comparer, d.mu.versions.currentVersion(), 1, d.mu.compact.inProgress, meta, + suggestSplit) + if err != nil { + return err.Error() + } + if overlapFile != nil { + fmt.Fprintf(&buf, "%d (split file: %s)\n", level, overlapFile.FileNum) + } else { + fmt.Fprintf(&buf, "%d\n", level) + } + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestIngest(t *testing.T) { + var mem vfs.FS + var d *DB + var flushed bool + defer func() { + require.NoError(t, d.Close()) + }() + + reset := func(split bool) { + if d != nil { + require.NoError(t, d.Close()) + } + + mem = vfs.NewMem() + require.NoError(t, mem.MkdirAll("ext", 0755)) + opts := &Options{ + FS: mem, + L0CompactionThreshold: 100, + L0StopWritesThreshold: 100, + DebugCheck: DebugCheckLevels, + EventListener: &EventListener{FlushEnd: func(info FlushInfo) { + flushed = true + }}, + FormatMajorVersion: internalFormatNewest, + } + opts.Experimental.IngestSplit = func() bool { + return split + } + // Disable automatic compactions because otherwise we'll race with + // delete-only compactions triggered by ingesting range tombstones. 
+ opts.DisableAutomaticCompactions = true + + var err error + d, err = Open("", opts) + require.NoError(t, err) + } + reset(false /* split */) + + datadriven.RunTest(t, "testdata/ingest", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "reset": + split := false + for _, cmd := range td.CmdArgs { + switch cmd.Key { + case "enable-split": + split = true + default: + return fmt.Sprintf("unexpected key: %s", cmd.Key) + } + } + reset(split) + return "" + case "batch": + b := d.NewIndexedBatch() + if err := runBatchDefineCmd(td, b); err != nil { + return err.Error() + } + if err := b.Commit(nil); err != nil { + return err.Error() + } + return "" + + case "build": + if err := runBuildCmd(td, d, mem); err != nil { + return err.Error() + } + return "" + + case "ingest": + flushed = false + if err := runIngestCmd(td, d, mem); err != nil { + return err.Error() + } + // Wait for a possible flush. + d.mu.Lock() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + d.mu.Unlock() + if flushed { + return "memtable flushed" + } + return "" + + case "get": + return runGetCmd(t, td, d) + + case "iter": + iter, _ := d.NewIter(&IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + }) + return runIterCmd(td, iter, true) + + case "lsm": + return runLSMCmd(td, d) + + case "metrics": + // The asynchronous loading of table stats can change metrics, so + // wait for all the tables' stats to be loaded. 
+ d.mu.Lock() + d.waitTableStats() + d.mu.Unlock() + + return d.Metrics().StringForTests() + + case "wait-pending-table-stats": + return runTableStatsCmd(td, d) + + case "compact": + if len(td.CmdArgs) != 2 { + panic("insufficient args for compact command") + } + l := td.CmdArgs[0].Key + r := td.CmdArgs[1].Key + err := d.Compact([]byte(l), []byte(r), false) + if err != nil { + return err.Error() + } + return "" + default: + return fmt.Sprintf("unknown command: %s", td.Cmd) + } + }) +} + +func TestIngestError(t *testing.T) { + for i := int32(0); ; i++ { + mem := vfs.NewMem() + + f0, err := mem.Create("ext0") + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f0), sstable.WriterOptions{}) + require.NoError(t, w.Set([]byte("d"), nil)) + require.NoError(t, w.Close()) + f1, err := mem.Create("ext1") + require.NoError(t, err) + w = sstable.NewWriter(objstorageprovider.NewFileWritable(f1), sstable.WriterOptions{}) + require.NoError(t, w.Set([]byte("d"), nil)) + require.NoError(t, w.Close()) + + ii := errorfs.OnIndex(-1) + d, err := Open("", &Options{ + FS: errorfs.Wrap(mem, errorfs.ErrInjected.If(ii)), + Logger: panicLogger{}, + L0CompactionThreshold: 8, + }) + require.NoError(t, err) + // Force the creation of an L0 sstable that overlaps with the tables + // we'll attempt to ingest. This ensures that we exercise filesystem + // codepaths when determining the ingest target level. + require.NoError(t, d.Set([]byte("a"), nil, nil)) + require.NoError(t, d.Set([]byte("d"), nil, nil)) + require.NoError(t, d.Flush()) + + t.Run(fmt.Sprintf("index-%d", i), func(t *testing.T) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(error); ok && errors.Is(e, errorfs.ErrInjected) { + return + } + // d.opts.Logger.Fatalf won't propagate ErrInjected + // itself, but should contain the error message. 
+ if strings.HasSuffix(fmt.Sprint(r), errorfs.ErrInjected.Error()) { + return + } + t.Fatal(r) + } + }() + + ii.Store(i) + err1 := d.Ingest([]string{"ext0"}) + err2 := d.Ingest([]string{"ext1"}) + err := firstError(err1, err2) + if err != nil && !errors.Is(err, errorfs.ErrInjected) { + t.Fatal(err) + } + }) + + // d.Close may error if we failed to flush the manifest. + _ = d.Close() + + // If the injector's index is non-negative, the i-th filesystem + // operation was never executed. + if ii.Load() >= 0 { + break + } + } +} + +func TestIngestIdempotence(t *testing.T) { + // Use an on-disk filesystem, because Ingest with a MemFS will copy, not + // link the ingested file. + dir, err := os.MkdirTemp("", "ingest-idempotence") + require.NoError(t, err) + defer os.RemoveAll(dir) + fs := vfs.Default + + path := fs.PathJoin(dir, "ext") + f, err := fs.Create(fs.PathJoin(dir, "ext")) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + require.NoError(t, w.Set([]byte("d"), nil)) + require.NoError(t, w.Close()) + + d, err := Open(dir, &Options{ + FS: fs, + }) + require.NoError(t, err) + const count = 4 + for i := 0; i < count; i++ { + ingestPath := fs.PathJoin(dir, fmt.Sprintf("ext%d", i)) + require.NoError(t, fs.Link(path, ingestPath)) + require.NoError(t, d.Ingest([]string{ingestPath})) + } + require.NoError(t, d.Close()) +} + +func TestIngestCompact(t *testing.T) { + mem := vfs.NewMem() + lel := MakeLoggingEventListener(&base.InMemLogger{}) + d, err := Open("", &Options{ + EventListener: &lel, + FS: mem, + L0CompactionThreshold: 1, + L0StopWritesThreshold: 1, + }) + require.NoError(t, err) + + src := func(i int) string { + return fmt.Sprintf("ext%d", i) + } + f, err := mem.Create(src(0)) + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + key := []byte("a") + require.NoError(t, w.Add(base.MakeInternalKey(key, 0, InternalKeyKindSet), nil)) + 
require.NoError(t, w.Close()) + + // Make N copies of the sstable. + const count = 20 + for i := 1; i < count; i++ { + require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i))) + } + + // Ingest the same sstable multiple times. Compaction should take place as + // ingestion happens, preventing an indefinite write stall from occurring. + for i := 0; i < count; i++ { + if i == 10 { + // Half-way through the ingestions, set a key in the memtable to force + // overlap with the memtable which will require the memtable to be + // flushed. + require.NoError(t, d.Set(key, nil, nil)) + } + require.NoError(t, d.Ingest([]string{src(i)})) + } + + require.NoError(t, d.Close()) +} + +func TestConcurrentIngest(t *testing.T) { + mem := vfs.NewMem() + d, err := Open("", &Options{ + FS: mem, + }) + require.NoError(t, err) + + // Create an sstable with 2 keys. This is necessary to trigger the overlap + // bug because an sstable with a single key will not have overlap in internal + // key space and the sequence number assignment had already guaranteed + // correct ordering. + src := func(i int) string { + return fmt.Sprintf("ext%d", i) + } + f, err := mem.Create(src(0)) + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + require.NoError(t, w.Set([]byte("a"), nil)) + require.NoError(t, w.Set([]byte("b"), nil)) + require.NoError(t, w.Close()) + + // Make N copies of the sstable. + errCh := make(chan error, 5) + for i := 1; i < cap(errCh); i++ { + require.NoError(t, vfs.Copy(d.opts.FS, src(0), src(i))) + } + + // Perform N ingestions concurrently. 
+ for i := 0; i < cap(errCh); i++ { + go func(i int) { + err := d.Ingest([]string{src(i)}) + if err == nil { + if _, err = d.opts.FS.Stat(src(i)); oserror.IsNotExist(err) { + err = nil + } + } + errCh <- err + }(i) + } + for i := 0; i < cap(errCh); i++ { + require.NoError(t, <-errCh) + } + + require.NoError(t, d.Close()) +} + +func TestConcurrentIngestCompact(t *testing.T) { + for i := 0; i < 2; i++ { + t.Run("", func(t *testing.T) { + mem := vfs.NewMem() + compactionReady := make(chan struct{}) + compactionBegin := make(chan struct{}) + d, err := Open("", &Options{ + FS: mem, + EventListener: &EventListener{ + TableCreated: func(info TableCreateInfo) { + if info.Reason == "compacting" { + close(compactionReady) + <-compactionBegin + } + }, + }, + }) + require.NoError(t, err) + + ingest := func(keys ...string) { + t.Helper() + f, err := mem.Create("ext") + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + for _, k := range keys { + require.NoError(t, w.Set([]byte(k), nil)) + } + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{"ext"})) + } + + compact := func(start, end string) { + t.Helper() + require.NoError(t, d.Compact([]byte(start), []byte(end), false)) + } + + lsm := func() string { + d.mu.Lock() + s := d.mu.versions.currentVersion().String() + d.mu.Unlock() + return s + } + + expectLSM := func(expected string) { + t.Helper() + expected = strings.TrimSpace(expected) + actual := strings.TrimSpace(lsm()) + if expected != actual { + t.Fatalf("expected\n%s\nbut found\n%s", expected, actual) + } + } + + ingest("a") + ingest("a") + ingest("c") + ingest("c") + + expectLSM(` +0.0: + 000005:[a#11,SET-a#11,SET] + 000007:[c#13,SET-c#13,SET] +6: + 000004:[a#10,SET-a#10,SET] + 000006:[c#12,SET-c#12,SET] +`) + + // At this point ingestion of an sstable containing only key "b" will be + // targeted at L6. 
Yet a concurrent compaction of sstables 5 and 7 will + // create a new sstable in L6 spanning ["a"-"c"]. So the ingestion must + // actually target L5. + + switch i { + case 0: + // Compact, then ingest. + go func() { + <-compactionReady + + ingest("b") + + close(compactionBegin) + }() + + compact("a", "z") + + expectLSM(` +0.0: + 000009:[b#14,SET-b#14,SET] +6: + 000008:[a#0,SET-c#0,SET] +`) + + case 1: + // Ingest, then compact + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + close(compactionBegin) + compact("a", "z") + }() + + ingest("b") + wg.Wait() + + // Because we're performing the ingestion and compaction concurrently, + // we can't guarantee any particular LSM structure at this point. The + // test will fail with an assertion error due to overlapping sstables + // if there is insufficient synchronization between ingestion and + // compaction. + } + + require.NoError(t, d.Close()) + }) + } +} + +func TestIngestFlushQueuedMemTable(t *testing.T) { + // Verify that ingestion forces a flush of a queued memtable. + + // Test with a format major version prior to FormatFlushableIngest and one + // after. Both should result in the same statistic calculations. + for _, fmv := range []FormatMajorVersion{FormatFlushableIngest - 1, internalFormatNewest} { + func(fmv FormatMajorVersion) { + mem := vfs.NewMem() + d, err := Open("", &Options{ + FS: mem, + FormatMajorVersion: fmv, + }) + require.NoError(t, err) + + // Add the key "a" to the memtable, then fill up the memtable with the key + // "b". The ingested sstable will only overlap with the queued memtable. 
+ require.NoError(t, d.Set([]byte("a"), nil, nil)) + for { + require.NoError(t, d.Set([]byte("b"), nil, nil)) + d.mu.Lock() + done := len(d.mu.mem.queue) == 2 + d.mu.Unlock() + if done { + break + } + } + + ingest := func(keys ...string) { + t.Helper() + f, err := mem.Create("ext") + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: fmv.MinTableFormat(), + }) + for _, k := range keys { + require.NoError(t, w.Set([]byte(k), nil)) + } + require.NoError(t, w.Close()) + stats, err := d.IngestWithStats([]string{"ext"}) + require.NoError(t, err) + require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes) + require.Equal(t, stats.MemtableOverlappingFiles, 1) + require.Less(t, uint64(0), stats.Bytes) + } + + ingest("a") + + require.NoError(t, d.Close()) + }(fmv) + } +} + +func TestIngestStats(t *testing.T) { + mem := vfs.NewMem() + d, err := Open("", &Options{ + FS: mem, + }) + require.NoError(t, err) + + ingest := func(expectedLevel int, keys ...string) { + t.Helper() + f, err := mem.Create("ext") + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + for _, k := range keys { + require.NoError(t, w.Set([]byte(k), nil)) + } + require.NoError(t, w.Close()) + stats, err := d.IngestWithStats([]string{"ext"}) + require.NoError(t, err) + if expectedLevel == 0 { + require.Equal(t, stats.ApproxIngestedIntoL0Bytes, stats.Bytes) + } else { + require.EqualValues(t, 0, stats.ApproxIngestedIntoL0Bytes) + } + require.Less(t, uint64(0), stats.Bytes) + } + ingest(6, "a") + ingest(0, "a") + ingest(6, "b", "g") + ingest(0, "c") + require.NoError(t, d.Close()) +} + +func TestIngestFlushQueuedLargeBatch(t *testing.T) { + // Verify that ingestion forces a flush of a queued large batch. 
+ + mem := vfs.NewMem() + d, err := Open("", &Options{ + FS: mem, + }) + require.NoError(t, err) + + // The default large batch threshold is slightly less than 1/2 of the + // memtable size which makes triggering a problem with flushing queued large + // batches irritating. Manually adjust the threshold to 1/8 of the memtable + // size in order to more easily create a situation where a large batch is + // queued but not automatically flushed. + d.mu.Lock() + d.largeBatchThreshold = d.opts.MemTableSize / 8 + d.mu.Unlock() + + // Set a record with a large value. This will be transformed into a large + // batch and placed in the flushable queue. + require.NoError(t, d.Set([]byte("a"), bytes.Repeat([]byte("v"), int(d.largeBatchThreshold)), nil)) + + ingest := func(keys ...string) { + t.Helper() + f, err := mem.Create("ext") + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + for _, k := range keys { + require.NoError(t, w.Set([]byte(k), nil)) + } + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{"ext"})) + } + + ingest("a") + + require.NoError(t, d.Close()) +} + +func TestIngestMemtablePendingOverlap(t *testing.T) { + mem := vfs.NewMem() + d, err := Open("", &Options{ + FS: mem, + }) + require.NoError(t, err) + + d.mu.Lock() + // Use a custom commit pipeline apply function to give us control over + // timing of events. 
+ assignedBatch := make(chan struct{}) + applyBatch := make(chan struct{}) + originalApply := d.commit.env.apply + d.commit.env.apply = func(b *Batch, mem *memTable) error { + assignedBatch <- struct{}{} + applyBatch <- struct{}{} + return originalApply(b, mem) + } + d.mu.Unlock() + + ingest := func(keys ...string) { + t.Helper() + f, err := mem.Create("ext") + require.NoError(t, err) + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + for _, k := range keys { + require.NoError(t, w.Set([]byte(k), nil)) + } + require.NoError(t, w.Close()) + require.NoError(t, d.Ingest([]string{"ext"})) + } + + var wg sync.WaitGroup + wg.Add(2) + + // First, Set('c') begins. This call will: + // + // * enqueue the batch to the pending queue. + // * allocate a sequence number `x`. + // * write the batch to the WAL. + // + // and then block until we read from the `applyBatch` channel down below. + go func() { + err := d.Set([]byte("c"), nil, nil) + if err != nil { + t.Error(err) + } + wg.Done() + }() + + // When the above Set('c') is ready to apply, it sends on the + // `assignedBatch` channel. Once that happens, we start Ingest('a', 'c'). + // The Ingest('a', 'c') allocates sequence number `x + 1`. + go func() { + // Wait until the Set has grabbed a sequence number before ingesting. + <-assignedBatch + ingest("a", "c") + wg.Done() + }() + + // The Set('c')#1 and Ingest('a', 'c')#2 are both pending. To maintain + // sequence number invariants, the Set needs to be applied and flushed + // before the Ingest determines its target level. + // + // Sleep a bit to ensure that the Ingest has time to call into + // AllocateSeqNum. Once it allocates its sequence number, it should see + // that there are unpublished sequence numbers below it and spin until the + // Set's sequence number is published. After sleeping, read from + // `applyBatch` to actually allow the Set to apply and publish its + // sequence number. 
+ time.Sleep(100 * time.Millisecond) + <-applyBatch + + // Wait for both calls to complete. + wg.Wait() + require.NoError(t, d.Flush()) + require.NoError(t, d.CheckLevels(nil)) + require.NoError(t, d.Close()) +} + +type testLogger struct { + t testing.TB +} + +func (l testLogger) Infof(format string, args ...interface{}) { + l.t.Logf(format, args...) +} + +func (l testLogger) Errorf(format string, args ...interface{}) { + l.t.Logf(format, args...) +} + +func (l testLogger) Fatalf(format string, args ...interface{}) { + l.t.Fatalf(format, args...) +} + +// TestIngestMemtableOverlapRace is a regression test for the race described in +// #2196. If an ingest that checks for overlap with the mutable memtable and +// finds no overlap, it must not allow overlapping keys with later sequence +// numbers to be applied to the memtable and the memtable to be flushed before +// the ingest completes. +// +// This test operates by committing the same key concurrently: +// - 1 goroutine repeatedly ingests the same sstable writing the key `foo` +// - n goroutines repeatedly apply batches writing the key `foo` and trigger +// flushes. +// +// After a while, the database is closed and the manifest is verified. Version +// edits should contain new files with monotonically increasing sequence +// numbers, since every flush and every ingest conflicts with one another. +func TestIngestMemtableOverlapRace(t *testing.T) { + mem := vfs.NewMem() + el := MakeLoggingEventListener(testLogger{t: t}) + d, err := Open("", &Options{ + FS: mem, + // Disable automatic compactions to keep the manifest clean; only + // flushes and ingests. + DisableAutomaticCompactions: true, + // Disable the WAL to speed up batch commits. + DisableWAL: true, + EventListener: &el, + // We're endlessly appending to L0 without clearing it, so set a maximal + // stop writes threshold. 
+ L0StopWritesThreshold: math.MaxInt, + // Accumulating more than 1 immutable memtable doesn't help us exercise + // the bug, since the committed keys need to be flushed promptly. + MemTableStopWritesThreshold: 2, + }) + require.NoError(t, err) + + // Prepare a sstable `ext` deleting foo. + f, err := mem.Create("ext") + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + require.NoError(t, w.Delete([]byte("foo"))) + require.NoError(t, w.Close()) + + var done atomic.Bool + const numSetters = 2 + var wg sync.WaitGroup + wg.Add(numSetters + 1) + + untilDone := func(fn func()) { + defer wg.Done() + for !done.Load() { + fn() + } + } + + // Ingest in the background. + totalIngests := 0 + go untilDone(func() { + filename := fmt.Sprintf("ext%d", totalIngests) + require.NoError(t, mem.Link("ext", filename)) + require.NoError(t, d.Ingest([]string{filename})) + totalIngests++ + }) + + // Apply batches and trigger flushes in the background. + wo := &WriteOptions{Sync: false} + var localCommits [numSetters]int + for i := 0; i < numSetters; i++ { + i := i + v := []byte(fmt.Sprintf("v%d", i+1)) + go untilDone(func() { + // Commit a batch setting foo=vN. + b := d.NewBatch() + require.NoError(t, b.Set([]byte("foo"), v, nil)) + require.NoError(t, b.Commit(wo)) + localCommits[i]++ + d.AsyncFlush() + }) + } + time.Sleep(100 * time.Millisecond) + done.Store(true) + wg.Wait() + + var totalCommits int + for i := 0; i < numSetters; i++ { + totalCommits += localCommits[i] + } + m := d.Metrics() + tot := m.Total() + t.Logf("Committed %d batches.", totalCommits) + t.Logf("Flushed %d times.", m.Flush.Count) + t.Logf("Ingested %d sstables.", tot.TablesIngested) + require.NoError(t, d.CheckLevels(nil)) + require.NoError(t, d.Close()) + + // Replay the manifest. Every flush and ingest is a separate version edit.
+ // Since they all write the same key and compactions are disabled, sequence + // numbers of new files should be monotonically increasing. + // + // This check is necessary because most of these sstables are ingested into + // L0. The L0 sublevels construction will order them by LargestSeqNum, even + // if they're added to L0 out-of-order. The CheckLevels call at the end of + // the test may find that the sublevels are all appropriately ordered, but + // the manifest may reveal they were added to the LSM out-of-order. + dbDesc, err := Peek("", mem) + require.NoError(t, err) + require.True(t, dbDesc.Exists) + f, err = mem.Open(dbDesc.ManifestFilename) + require.NoError(t, err) + defer f.Close() + rr := record.NewReader(f, 0 /* logNum */) + var largest *fileMetadata + for { + r, err := rr.Next() + if err == io.EOF || err == record.ErrInvalidChunk { + break + } + require.NoError(t, err) + var ve manifest.VersionEdit + require.NoError(t, ve.Decode(r)) + t.Log(ve.String()) + for _, f := range ve.NewFiles { + if largest != nil { + require.Equal(t, 0, f.Level) + if largest.LargestSeqNum > f.Meta.LargestSeqNum { + t.Fatalf("previous largest file %s has sequence number > next file %s", largest, f.Meta) + } + } + largest = f.Meta + } + } +} + +type ingestCrashFS struct { + vfs.FS +} + +func (fs ingestCrashFS) Link(oldname, newname string) error { + if err := fs.FS.Link(oldname, newname); err != nil { + return err + } + panic(errorfs.ErrInjected) +} + +type noRemoveFS struct { + vfs.FS +} + +func (fs noRemoveFS) Remove(string) error { + return errorfs.ErrInjected +} + +func TestIngestFileNumReuseCrash(t *testing.T) { + const count = 10 + // Use an on-disk filesystem, because Ingest with a MemFS will copy, not + // link the ingested file. 
+ dir, err := os.MkdirTemp("", "ingest-filenum-reuse") + require.NoError(t, err) + defer os.RemoveAll(dir) + fs := vfs.Default + + readFile := func(s string) []byte { + f, err := fs.Open(fs.PathJoin(dir, s)) + require.NoError(t, err) + b, err := io.ReadAll(f) + require.NoError(t, err) + require.NoError(t, f.Close()) + return b + } + + // Create sstables to ingest. + var files []string + var fileBytes [][]byte + for i := 0; i < count; i++ { + name := fmt.Sprintf("ext%d", i) + f, err := fs.Create(fs.PathJoin(dir, name)) + require.NoError(t, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + require.NoError(t, w.Set([]byte(fmt.Sprintf("foo%d", i)), nil)) + require.NoError(t, w.Close()) + files = append(files, name) + fileBytes = append(fileBytes, readFile(name)) + } + + // Open a database with a filesystem that will successfully link the + // ingested files but then panic. This is an approximation of what a crash + // after linking but before updating the manifest would look like. + d, err := Open(dir, &Options{ + FS: ingestCrashFS{FS: fs}, + }) + // A flush here ensures the file num bumps from creating OPTIONS files, + // etc get recorded in the manifest. We want the nextFileNum after the + // restart to be the same as one of our ingested sstables. + require.NoError(t, err) + require.NoError(t, d.Set([]byte("boop"), nil, nil)) + require.NoError(t, d.Flush()) + for _, f := range files { + func() { + defer func() { err = recover().(error) }() + err = d.Ingest([]string{fs.PathJoin(dir, f)}) + }() + if err == nil || !errors.Is(err, errorfs.ErrInjected) { + t.Fatalf("expected injected error, got %v", err) + } + } + // Leave something in the WAL so that Open will flush while replaying the + // WAL. + require.NoError(t, d.Set([]byte("wal"), nil, nil)) + require.NoError(t, d.Close()) + + // There are now two links to each external file: the original extX link + // and a numbered sstable link. 
The sstable files are still not a part of + // the manifest and so they may be overwritten. Open will detect the + // obsolete number sstables and try to remove them. The FS here is wrapped + // to induce errors on Remove calls. Even if we're unsuccessful in + // removing the obsolete files, the external files should not be + // overwritten. + d, err = Open(dir, &Options{FS: noRemoveFS{FS: fs}}) + require.NoError(t, err) + require.NoError(t, d.Set([]byte("bar"), nil, nil)) + require.NoError(t, d.Flush()) + require.NoError(t, d.Close()) + + // None of the external files should change despite modifying the linked + // versions. + for i, f := range files { + afterBytes := readFile(f) + require.Equal(t, fileBytes[i], afterBytes) + } +} + +func TestIngest_UpdateSequenceNumber(t *testing.T) { + mem := vfs.NewMem() + cmp := base.DefaultComparer.Compare + parse := func(input string) (*sstable.Writer, error) { + f, err := mem.Create("ext") + if err != nil { + return nil, err + } + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + TableFormat: sstable.TableFormatMax, + }) + for _, data := range strings.Split(input, "\n") { + if strings.HasPrefix(data, "rangekey: ") { + data = strings.TrimPrefix(data, "rangekey: ") + s := keyspan.ParseSpan(data) + err := rangekey.Encode(&s, w.AddRangeKey) + if err != nil { + return nil, err + } + continue + } + j := strings.Index(data, ":") + if j < 0 { + return nil, errors.Newf("malformed input: %s\n", data) + } + key := base.ParseInternalKey(data[:j]) + value := []byte(data[j+1:]) + if err := w.Add(key, value); err != nil { + return nil, err + } + } + return w, nil + } + + var ( + seqnum uint64 + err error + metas []*fileMetadata + ) + datadriven.RunTest(t, "testdata/ingest_update_seqnums", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "starting-seqnum": + seqnum, err = strconv.ParseUint(td.Input, 10, 64) + if err != nil { + return err.Error() + } + return "" + + case 
"reset": + metas = metas[:0] + return "" + + case "load": + w, err := parse(td.Input) + if err != nil { + return err.Error() + } + if err = w.Close(); err != nil { + return err.Error() + } + defer w.Close() + + // Format the bounds of the table. + wm, err := w.Metadata() + if err != nil { + return err.Error() + } + + // Upper bounds for range dels and range keys are expected to be sentinel + // keys. + maybeUpdateUpperBound := func(key base.InternalKey) base.InternalKey { + switch k := key.Kind(); { + case k == base.InternalKeyKindRangeDelete: + key.Trailer = base.InternalKeyRangeDeleteSentinel + case rangekey.IsRangeKey(k): + return base.MakeExclusiveSentinelKey(k, key.UserKey) + } + return key + } + + // Construct the file metadata from the writer metadata. + m := &fileMetadata{ + SmallestSeqNum: 0, // Simulate an ingestion. + LargestSeqNum: 0, + } + if wm.HasPointKeys { + m.ExtendPointKeyBounds(cmp, wm.SmallestPoint, wm.LargestPoint) + } + if wm.HasRangeDelKeys { + m.ExtendPointKeyBounds( + cmp, + wm.SmallestRangeDel, + maybeUpdateUpperBound(wm.LargestRangeDel), + ) + } + if wm.HasRangeKeys { + m.ExtendRangeKeyBounds( + cmp, + wm.SmallestRangeKey, + maybeUpdateUpperBound(wm.LargestRangeKey), + ) + } + m.InitPhysicalBacking() + if err := m.Validate(cmp, base.DefaultFormatter); err != nil { + return err.Error() + } + + // Collect this file. + metas = append(metas, m) + + // Return an index number for the file. + return fmt.Sprintf("file %d\n", len(metas)-1) + + case "update-files": + // Update the bounds across all files. 
+ if err = ingestUpdateSeqNum(cmp, base.DefaultFormatter, seqnum, ingestLoadResult{localMeta: metas}); err != nil { + return err.Error() + } + + var buf bytes.Buffer + for i, m := range metas { + fmt.Fprintf(&buf, "file %d:\n", i) + fmt.Fprintf(&buf, " combined: %s-%s\n", m.Smallest, m.Largest) + fmt.Fprintf(&buf, " points: %s-%s\n", m.SmallestPointKey, m.LargestPointKey) + fmt.Fprintf(&buf, " ranges: %s-%s\n", m.SmallestRangeKey, m.LargestRangeKey) + } + + return buf.String() + + default: + return fmt.Sprintf("unknown command %s\n", td.Cmd) + } + }) +} + +func TestIngestCleanup(t *testing.T) { + fns := []base.FileNum{0, 1, 2} + + testCases := []struct { + closeFiles []base.FileNum + cleanupFiles []base.FileNum + wantErr string + }{ + // Close and remove all files. + { + closeFiles: fns, + cleanupFiles: fns, + }, + // Remove a non-existent file. + { + closeFiles: fns, + cleanupFiles: []base.FileNum{3}, + wantErr: "unknown to the objstorage provider", + }, + // Remove a file that has not been closed. + { + closeFiles: []base.FileNum{0, 2}, + cleanupFiles: fns, + wantErr: oserror.ErrInvalid.Error(), + }, + // Remove all files, one of which is still open, plus a file that does not exist. + { + closeFiles: []base.FileNum{0, 2}, + cleanupFiles: []base.FileNum{0, 1, 2, 3}, + wantErr: oserror.ErrInvalid.Error(), // The first error encountered is due to the open file. + }, + } + + for _, tc := range testCases { + t.Run("", func(t *testing.T) { + mem := vfs.NewMem() + mem.UseWindowsSemantics(true) + objProvider, err := objstorageprovider.Open(objstorageprovider.DefaultSettings(mem, "")) + require.NoError(t, err) + defer objProvider.Close() + + // Create the files in the VFS. + metaMap := make(map[base.FileNum]objstorage.Writable) + for _, fn := range fns { + w, _, err := objProvider.Create(context.Background(), base.FileTypeTable, fn.DiskFileNum(), objstorage.CreateOptions{}) + require.NoError(t, err) + + metaMap[fn] = w + } + + // Close a select number of files. 
+ for _, m := range tc.closeFiles { + w, ok := metaMap[m] + if !ok { + continue + } + require.NoError(t, w.Finish()) + } + + // Cleanup the set of files in the FS. + var toRemove []*fileMetadata + for _, fn := range tc.cleanupFiles { + m := &fileMetadata{FileNum: fn} + m.InitPhysicalBacking() + toRemove = append(toRemove, m) + } + + err = ingestCleanup(objProvider, toRemove) + if tc.wantErr != "" { + require.Error(t, err, "got no error, expected %s", tc.wantErr) + require.Contains(t, err.Error(), tc.wantErr) + } else { + require.NoError(t, err) + } + }) + } +} + +// fatalCapturingLogger captures a fatal error instead of panicking. +type fatalCapturingLogger struct { + t testing.TB + err error +} + +// Infof implements the Logger interface. +func (l *fatalCapturingLogger) Infof(fmt string, args ...interface{}) { + l.t.Logf(fmt, args...) +} + +// Errorf implements the Logger interface. +func (l *fatalCapturingLogger) Errorf(fmt string, args ...interface{}) { + l.t.Logf(fmt, args...) +} + +// Fatalf implements the Logger interface. +func (l *fatalCapturingLogger) Fatalf(_ string, args ...interface{}) { + l.err = args[0].(error) +} + +func TestIngestValidation(t *testing.T) { + type keyVal struct { + key, val []byte + } + // The corruptionLocation enum defines where to corrupt an sstable if + // anywhere. corruptionLocation{Start,End} describe the start and end + // data blocks. corruptionLocationInternal describes a random data block + // that's neither the start or end blocks. The Ingest operation does not + // read the entire sstable, only the start and end blocks, so corruption + // introduced using corruptionLocationInternal will not be discovered until + // the asynchronous validation job runs. 
+ type corruptionLocation int + const ( + corruptionLocationNone corruptionLocation = iota + corruptionLocationStart + corruptionLocationEnd + corruptionLocationInternal + ) + // The errReportLocation type defines an enum to allow tests to enforce + // expectations about how an error surfaced during ingestion or validation + // is reported. Asynchronous validation that uncovers corruption should call + // Fatalf on the Logger. Asynchronous validation that encounters + // non-corruption errors should surface it through the + // EventListener.BackgroundError func. + type errReportLocation int + const ( + errReportLocationNone errReportLocation = iota + errReportLocationIngest + errReportLocationFatal + errReportLocationBackgroundError + ) + const ( + nKeys = 1_000 + keySize = 16 + valSize = 100 + blockSize = 100 + + ingestTableName = "ext" + ) + + seed := uint64(time.Now().UnixNano()) + rng := rand.New(rand.NewSource(seed)) + t.Logf("rng seed = %d", seed) + + // errfsCounter is used by test cases that make use of an errorfs.Injector + // to inject errors into the ingest validation code path.
+ var errfsCounter atomic.Int32 + testCases := []struct { + description string + cLoc corruptionLocation + wantErrType errReportLocation + wantErr error + errorfsInjector errorfs.Injector + }{ + { + description: "no corruption", + cLoc: corruptionLocationNone, + wantErrType: errReportLocationNone, + }, + { + description: "start block", + cLoc: corruptionLocationStart, + wantErr: ErrCorruption, + wantErrType: errReportLocationIngest, + }, + { + description: "end block", + cLoc: corruptionLocationEnd, + wantErr: ErrCorruption, + wantErrType: errReportLocationIngest, + }, + { + description: "non-end block", + cLoc: corruptionLocationInternal, + wantErr: ErrCorruption, + wantErrType: errReportLocationFatal, + }, + { + description: "non-corruption error", + cLoc: corruptionLocationNone, + wantErr: errorfs.ErrInjected, + wantErrType: errReportLocationBackgroundError, + errorfsInjector: errorfs.InjectorFunc(func(op errorfs.Op) error { + // Inject an error on the first read-at operation on an sstable + // (excluding the read on the sstable before ingestion has + // linked it in). + if op.Path != "ext" && op.Kind != errorfs.OpFileReadAt || filepath.Ext(op.Path) != ".sst" { + return nil + } + if errfsCounter.Add(1) == 1 { + return errorfs.ErrInjected + } + return nil + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + errfsCounter.Store(0) + var wg sync.WaitGroup + wg.Add(1) + + fs := vfs.NewMem() + var testFS vfs.FS = fs + if tc.errorfsInjector != nil { + testFS = errorfs.Wrap(fs, tc.errorfsInjector) + } + + // backgroundErr is populated by EventListener.BackgroundError. 
+ var backgroundErr error + logger := &fatalCapturingLogger{t: t} + opts := &Options{ + FS: testFS, + Logger: logger, + EventListener: &EventListener{ + TableValidated: func(i TableValidatedInfo) { + wg.Done() + }, + BackgroundError: func(err error) { + backgroundErr = err + }, + }, + } + // Disable table stats so that injected errors can't be accidentally + // injected into the table stats collector read, and so the table + // stats collector won't prime the table+block cache such that the + // error injection won't trigger at all during ingest validation. + opts.private.disableTableStats = true + opts.Experimental.ValidateOnIngest = true + d, err := Open("", opts) + require.NoError(t, err) + defer func() { require.NoError(t, d.Close()) }() + + corrupt := func(f vfs.File) { + readable, err := sstable.NewSimpleReadable(f) + require.NoError(t, err) + // Compute the layout of the sstable in order to find the + // appropriate block locations to corrupt. + r, err := sstable.NewReader(readable, sstable.ReaderOptions{}) + require.NoError(t, err) + l, err := r.Layout() + require.NoError(t, err) + + // Select an appropriate data block to corrupt. + var blockIdx int + switch tc.cLoc { + case corruptionLocationStart: + blockIdx = 0 + case corruptionLocationEnd: + blockIdx = len(l.Data) - 1 + case corruptionLocationInternal: + blockIdx = 1 + rng.Intn(len(l.Data)-2) + default: + t.Fatalf("unknown corruptionLocation: %T", tc.cLoc) + } + bh := l.Data[blockIdx] + + // Corrupting a key will cause the ingestion to fail due to a + // malformed key, rather than a block checksum mismatch. + // Instead, we corrupt the last byte in the selected block, + // before the trailer, which corresponds to a value. 
+ offset := bh.Offset + bh.Length - 1 + _, err = f.WriteAt([]byte("\xff"), int64(offset)) + require.NoError(t, err) + require.NoError(t, r.Close()) + } + + type errT struct { + errLoc errReportLocation + err error + } + runIngest := func(keyVals []keyVal) (et errT) { + f, err := fs.Create(ingestTableName) + require.NoError(t, err) + defer func() { _ = fs.Remove(ingestTableName) }() + + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{ + BlockSize: blockSize, // Create many smaller blocks. + Compression: NoCompression, // For simpler debugging. + }) + for _, kv := range keyVals { + require.NoError(t, w.Set(kv.key, kv.val)) + } + require.NoError(t, w.Close()) + + // Possibly corrupt the file. + if tc.cLoc != corruptionLocationNone { + f, err = fs.OpenReadWrite(ingestTableName) + require.NoError(t, err) + corrupt(f) + } + + // Ingest the external table. + err = d.Ingest([]string{ingestTableName}) + if err != nil { + et.errLoc = errReportLocationIngest + et.err = err + return + } + + // Wait for the validation on the sstable to complete. + wg.Wait() + + // Return any error encountered during validation. + if logger.err != nil { + et.errLoc = errReportLocationFatal + et.err = logger.err + } else if backgroundErr != nil { + et.errLoc = errReportLocationBackgroundError + et.err = backgroundErr + } + return + } + + // Construct a set of keys to ingest. + var keyVals []keyVal + for i := 0; i < nKeys; i++ { + key := make([]byte, keySize) + _, err = rng.Read(key) + require.NoError(t, err) + + val := make([]byte, valSize) + _, err = rng.Read(val) + require.NoError(t, err) + + keyVals = append(keyVals, keyVal{key, val}) + } + + // Keys must be sorted. + slices.SortFunc(keyVals, func(a, b keyVal) int { return d.cmp(a.key, b.key) }) + + // Run the ingestion. + et := runIngest(keyVals) + + // Assert we saw the errors we expect. 
+ switch tc.wantErrType { + case errReportLocationNone: + require.Equal(t, errReportLocationNone, et.errLoc) + require.NoError(t, et.err) + case errReportLocationIngest: + require.Equal(t, errReportLocationIngest, et.errLoc) + require.Error(t, et.err) + require.True(t, errors.Is(et.err, tc.wantErr)) + case errReportLocationFatal: + require.Equal(t, errReportLocationFatal, et.errLoc) + require.Error(t, et.err) + require.True(t, errors.Is(et.err, tc.wantErr)) + case errReportLocationBackgroundError: + require.Equal(t, errReportLocationBackgroundError, et.errLoc) + require.Error(t, et.err) + require.True(t, errors.Is(et.err, tc.wantErr)) + default: + t.Fatalf("unknown wantErrType %T", tc.wantErrType) + } + }) + } +} + +// BenchmarkManySSTables measures the cost of various operations with various +// counts of SSTables within the database. +func BenchmarkManySSTables(b *testing.B) { + counts := []int{10, 1_000, 10_000, 100_000, 1_000_000} + ops := []string{"ingest", "calculateInuseKeyRanges"} + for _, op := range ops { + b.Run(op, func(b *testing.B) { + for _, count := range counts { + b.Run(fmt.Sprintf("sstables=%d", count), func(b *testing.B) { + mem := vfs.NewMem() + d, err := Open("", &Options{ + FS: mem, + }) + require.NoError(b, err) + + var paths []string + for i := 0; i < count; i++ { + n := fmt.Sprintf("%07d", i) + f, err := mem.Create(n) + require.NoError(b, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + require.NoError(b, w.Set([]byte(n), nil)) + require.NoError(b, w.Close()) + paths = append(paths, n) + } + require.NoError(b, d.Ingest(paths)) + + { + const broadIngest = "broad.sst" + f, err := mem.Create(broadIngest) + require.NoError(b, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + require.NoError(b, w.Set([]byte("0"), nil)) + require.NoError(b, w.Set([]byte("Z"), nil)) + require.NoError(b, w.Close()) + require.NoError(b, d.Ingest([]string{broadIngest})) + } 
+ + switch op { + case "ingest": + runBenchmarkManySSTablesIngest(b, d, mem, count) + case "calculateInuseKeyRanges": + runBenchmarkManySSTablesInUseKeyRanges(b, d, count) + } + require.NoError(b, d.Close()) + }) + } + }) + } +} + +func runBenchmarkManySSTablesIngest(b *testing.B, d *DB, fs vfs.FS, count int) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + n := fmt.Sprintf("%07d", count+i) + f, err := fs.Create(n) + require.NoError(b, err) + w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), sstable.WriterOptions{}) + require.NoError(b, w.Set([]byte(n), nil)) + require.NoError(b, w.Close()) + require.NoError(b, d.Ingest([]string{n})) + } +} + +func runBenchmarkManySSTablesInUseKeyRanges(b *testing.B, d *DB, count int) { + // This benchmark is pretty contrived, but it's not easy to write a + // microbenchmark for this in a more natural way. L6 has many files, and + // L5 has 1 file spanning the entire breadth of L5. + d.mu.Lock() + defer d.mu.Unlock() + v := d.mu.versions.currentVersion() + b.ResetTimer() + + smallest := []byte("0") + largest := []byte("z") + for i := 0; i < b.N; i++ { + _ = calculateInuseKeyRanges(v, d.cmp, 0, numLevels-1, smallest, largest) + } +} diff --git a/pebble/internal.go b/pebble/internal.go new file mode 100644 index 0000000..61a4284 --- /dev/null +++ b/pebble/internal.go @@ -0,0 +1,51 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import "github.com/cockroachdb/pebble/internal/base" + +// InternalKeyKind exports the base.InternalKeyKind type. +type InternalKeyKind = base.InternalKeyKind + +// These constants are part of the file format, and should not be changed. 
+const ( + InternalKeyKindDelete = base.InternalKeyKindDelete + InternalKeyKindSet = base.InternalKeyKindSet + InternalKeyKindMerge = base.InternalKeyKindMerge + InternalKeyKindLogData = base.InternalKeyKindLogData + InternalKeyKindSingleDelete = base.InternalKeyKindSingleDelete + InternalKeyKindRangeDelete = base.InternalKeyKindRangeDelete + InternalKeyKindMax = base.InternalKeyKindMax + InternalKeyKindSetWithDelete = base.InternalKeyKindSetWithDelete + InternalKeyKindRangeKeySet = base.InternalKeyKindRangeKeySet + InternalKeyKindRangeKeyUnset = base.InternalKeyKindRangeKeyUnset + InternalKeyKindRangeKeyDelete = base.InternalKeyKindRangeKeyDelete + InternalKeyKindIngestSST = base.InternalKeyKindIngestSST + InternalKeyKindDeleteSized = base.InternalKeyKindDeleteSized + InternalKeyKindInvalid = base.InternalKeyKindInvalid + InternalKeySeqNumBatch = base.InternalKeySeqNumBatch + InternalKeySeqNumMax = base.InternalKeySeqNumMax + InternalKeyRangeDeleteSentinel = base.InternalKeyRangeDeleteSentinel +) + +// InternalKey exports the base.InternalKey type. +type InternalKey = base.InternalKey + +type internalIterator = base.InternalIterator + +// ErrCorruption is a marker to indicate that data in a file (WAL, MANIFEST, +// sstable) isn't in the expected format. +var ErrCorruption = base.ErrCorruption + +// AttributeAndLen exports the base.AttributeAndLen type. +type AttributeAndLen = base.AttributeAndLen + +// ShortAttribute exports the base.ShortAttribute type. +type ShortAttribute = base.ShortAttribute + +// LazyFetcher exports the base.LazyFetcher type. This export is needed since +// LazyValue.Clone requires a pointer to a LazyFetcher struct to avoid +// allocations. No code outside Pebble needs to peer into a LazyFetcher. 
+type LazyFetcher = base.LazyFetcher diff --git a/pebble/internal/ackseq/ackseq.go b/pebble/internal/ackseq/ackseq.go new file mode 100644 index 0000000..f2c682f --- /dev/null +++ b/pebble/internal/ackseq/ackseq.go @@ -0,0 +1,83 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package ackseq + +import ( + "sync" + "sync/atomic" + + "github.com/cockroachdb/errors" +) + +const ( + // The window size constants. These values specify a window that can hold ~1m + // pending unacknowledged sequence numbers using 128 KB of memory. + windowSize = 1 << 20 + windowMask = windowSize - 1 + windowBytes = (windowSize + 7) / 8 +) + +// S keeps track of the largest sequence number such that all sequence numbers +// in the range [0,v) have been acknowledged. +type S struct { + next atomic.Uint64 + mu struct { + sync.Mutex + base uint64 + window [windowBytes]uint8 + } +} + +// New creates a new acknowledged sequence tracker with the specified base +// sequence number. All of the sequence numbers in the range [0,base) are +// considered acknowledged. Next() will return base upon first call. +func New(base uint64) *S { + s := &S{} + s.next.Store(base) + s.mu.base = base + return s +} + +// Next returns the next sequence number to use. +func (s *S) Next() uint64 { + return s.next.Add(1) - 1 +} + +// Ack acknowledges the specified seqNum, adjusting base as necessary, +// returning the number of newly acknowledged sequence numbers. 
+func (s *S) Ack(seqNum uint64) (int, error) { + s.mu.Lock() + if s.getLocked(seqNum) { + defer s.mu.Unlock() + return 0, errors.Errorf( + "pending acks exceeds window size: %d has been acked, but %d has not", + errors.Safe(seqNum), errors.Safe(s.mu.base)) + } + + var count int + s.setLocked(seqNum) + for s.getLocked(s.mu.base) { + s.clearLocked(s.mu.base) + s.mu.base++ + count++ + } + s.mu.Unlock() + return count, nil +} + +func (s *S) getLocked(seqNum uint64) bool { + bit := seqNum & windowMask + return (s.mu.window[bit/8] & (1 << (bit % 8))) != 0 +} + +func (s *S) setLocked(seqNum uint64) { + bit := seqNum & windowMask + s.mu.window[bit/8] |= (1 << (bit % 8)) +} + +func (s *S) clearLocked(seqNum uint64) { + bit := seqNum & windowMask + s.mu.window[bit/8] &^= (1 << (bit % 8)) +} diff --git a/pebble/internal/arenaskl/LICENSE b/pebble/internal/arenaskl/LICENSE new file mode 100644 index 0000000..8dada3e --- /dev/null +++ b/pebble/internal/arenaskl/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/pebble/internal/arenaskl/README.md b/pebble/internal/arenaskl/README.md new file mode 100644 index 0000000..93a7d32 --- /dev/null +++ b/pebble/internal/arenaskl/README.md @@ -0,0 +1,93 @@ +# arenaskl + +Fast, lock-free, arena-based Skiplist implementation in Go that supports iteration +in both directions. + +## Advantages + +Arenaskl offers several advantages over other skiplist implementations: + +* High performance that linearly scales with the number of cores. This is + achieved by allocating from a fixed-size arena and by avoiding locks. +* Iterators that can be allocated on the stack and easily cloned by value. +* Simple-to-use and low overhead model for detecting and handling race conditions + with other threads. +* Support for iterating in reverse (i.e. previous links). + +## Limitations + +The advantages come at a cost that prevents arenaskl from being a general-purpose +skiplist implementation: + +* The size of the arena sets a hard upper bound on the combined size of skiplist + nodes, keys, and values. This limit includes even the size of deleted nodes, + keys, and values. +* Deletion is not supported. Instead, higher-level code is expected to + add deletion tombstones and needs to process those tombstones + appropriately. 
+ +## Pedigree + +This code is based on Andy Kimball's arenaskl code: + +https://github.com/andy-kimball/arenaskl + +The arenaskl code is based on the skiplist found in Badger, a Go-based +KV store: + +https://github.com/dgraph-io/badger/tree/master/skl + +The skiplist in Badger is itself based on a C++ skiplist built for +Facebook's RocksDB: + +https://github.com/facebook/rocksdb/tree/master/memtable + +## Benchmarks + +The benchmarks consist of a mix of reads and writes executed in parallel. The +fraction of reads is indicated in the run name: "frac_X" indicates a run where +X percent of the operations are reads. + +The results are much better than `skiplist` and `slist`. + +``` +name time/op +ReadWrite/frac_0-8 470ns ±11% +ReadWrite/frac_10-8 462ns ± 3% +ReadWrite/frac_20-8 436ns ± 2% +ReadWrite/frac_30-8 410ns ± 2% +ReadWrite/frac_40-8 385ns ± 2% +ReadWrite/frac_50-8 360ns ± 4% +ReadWrite/frac_60-8 386ns ± 1% +ReadWrite/frac_70-8 352ns ± 2% +ReadWrite/frac_80-8 306ns ± 3% +ReadWrite/frac_90-8 253ns ± 4% +ReadWrite/frac_100-8 28.1ns ± 2% +``` + +Note that the above numbers are for concurrent operations using 8x +parallelism. The same benchmarks without concurrency (use these +numbers when comparing vs batchskl): + +``` +name time/op +ReadWrite/frac_0 1.53µs ± 1% +ReadWrite/frac_10 1.46µs ± 2% +ReadWrite/frac_20 1.39µs ± 3% +ReadWrite/frac_30 1.28µs ± 3% +ReadWrite/frac_40 1.21µs ± 2% +ReadWrite/frac_50 1.11µs ± 3% +ReadWrite/frac_60 1.23µs ±17% +ReadWrite/frac_70 1.16µs ± 4% +ReadWrite/frac_80 959ns ± 3% +ReadWrite/frac_90 738ns ± 5% +ReadWrite/frac_100 81.9ns ± 2% +``` + +Forward and backward iteration are also fast: + +``` +name time/op +IterNext 3.97ns ± 5% +IterPrev 3.88ns ± 3% +``` diff --git a/pebble/internal/arenaskl/arena.go b/pebble/internal/arenaskl/arena.go new file mode 100644 index 0000000..011c3b0 --- /dev/null +++ b/pebble/internal/arenaskl/arena.go @@ -0,0 +1,125 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. 
and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package arenaskl + +import ( + "sync/atomic" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/constants" + "github.com/cockroachdb/pebble/internal/invariants" +) + +// Arena is lock-free. +type Arena struct { + n atomic.Uint64 + buf []byte +} + +const nodeAlignment = 4 + +var ( + // ErrArenaFull indicates that the arena is full and cannot perform any more + // allocations. + ErrArenaFull = errors.New("allocation failed because arena is full") +) + +// NewArena allocates a new arena using the specified buffer as the backing +// store. +func NewArena(buf []byte) *Arena { + if len(buf) > constants.MaxUint32OrInt { + if invariants.Enabled { + panic(errors.AssertionFailedf("attempting to create arena of size %d", len(buf))) + } + buf = buf[:constants.MaxUint32OrInt] + } + a := &Arena{ + buf: buf, + } + // We don't store data at position 0 in order to reserve offset=0 as a kind of + // nil pointer. + a.n.Store(1) + return a +} + +// Size returns the number of bytes allocated by the arena. +func (a *Arena) Size() uint32 { + s := a.n.Load() + if s > constants.MaxUint32OrInt { + // The last failed allocation can push the size higher than len(a.buf). + // Saturate at the maximum representable offset. 
+ return constants.MaxUint32OrInt + } + return uint32(s) +} + +// Capacity returns the capacity of the arena. +func (a *Arena) Capacity() uint32 { + return uint32(len(a.buf)) +} + +// alloc allocates a buffer of the given size and with the given alignment +// (which must be a power of 2). +// +// If overflow is not 0, it also ensures that many bytes after the buffer are +// inside the arena (this is used for structures that are larger than the +// requested size but don't use those extra bytes). +func (a *Arena) alloc(size, alignment, overflow uint32) (uint32, uint32, error) { + if invariants.Enabled && (alignment&(alignment-1)) != 0 { + panic(errors.AssertionFailedf("invalid alignment %d", alignment)) + } + // Verify that the arena isn't already full. + origSize := a.n.Load() + if int(origSize) > len(a.buf) { + return 0, 0, ErrArenaFull + } + + // Pad the allocation with enough bytes to ensure the requested alignment. + padded := uint64(size) + uint64(alignment) - 1 + + newSize := a.n.Add(padded) + if newSize+uint64(overflow) > uint64(len(a.buf)) { + return 0, 0, ErrArenaFull + } + + // Return the aligned offset. + offset := (uint32(newSize) - size) & ^(alignment - 1) + return offset, uint32(padded), nil +} + +func (a *Arena) getBytes(offset uint32, size uint32) []byte { + if offset == 0 { + return nil + } + return a.buf[offset : offset+size : offset+size] +} + +func (a *Arena) getPointer(offset uint32) unsafe.Pointer { + if offset == 0 { + return nil + } + return unsafe.Pointer(&a.buf[offset]) +} + +func (a *Arena) getPointerOffset(ptr unsafe.Pointer) uint32 { + if ptr == nil { + return 0 + } + return uint32(uintptr(ptr) - uintptr(unsafe.Pointer(&a.buf[0]))) +} diff --git a/pebble/internal/arenaskl/arena_test.go b/pebble/internal/arenaskl/arena_test.go new file mode 100644 index 0000000..f264b8d --- /dev/null +++ b/pebble/internal/arenaskl/arena_test.go @@ -0,0 +1,53 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. 
and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package arenaskl + +import ( + "math" + "testing" + + "github.com/cockroachdb/pebble/internal/constants" + "github.com/stretchr/testify/require" +) + +func newArena(n uint32) *Arena { + return NewArena(make([]byte, n)) +} + +// TestArenaSizeOverflow tests that large allocations do not cause Arena's +// internal size accounting to overflow and produce incorrect results. +func TestArenaSizeOverflow(t *testing.T) { + a := newArena(constants.MaxUint32OrInt) + + // Allocating under the limit throws no error. + offset, _, err := a.alloc(math.MaxUint16, 1, 0) + require.Nil(t, err) + require.Equal(t, uint32(1), offset) + require.Equal(t, uint32(math.MaxUint16)+1, a.Size()) + + // Allocating over the limit could cause an accounting + // overflow if 32-bit arithmetic was used. It shouldn't. + _, _, err = a.alloc(math.MaxUint32, 1, 0) + require.Equal(t, ErrArenaFull, err) + require.Equal(t, uint32(constants.MaxUint32OrInt), a.Size()) + + // Continuing to allocate continues to throw an error. 
+ _, _, err = a.alloc(math.MaxUint16, 1, 0) + require.Equal(t, ErrArenaFull, err) + require.Equal(t, uint32(constants.MaxUint32OrInt), a.Size()) +} diff --git a/pebble/internal/arenaskl/flush_iterator.go b/pebble/internal/arenaskl/flush_iterator.go new file mode 100644 index 0000000..2a7ea03 --- /dev/null +++ b/pebble/internal/arenaskl/flush_iterator.go @@ -0,0 +1,88 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package arenaskl + +import "github.com/cockroachdb/pebble/internal/base" + +// flushIterator is an iterator over the skiplist object. Use Skiplist.NewFlushIter +// to construct an iterator. The current state of the iterator can be cloned by +// simply value copying the struct. +type flushIterator struct { + Iterator + bytesIterated *uint64 +} + +// flushIterator implements the base.InternalIterator interface. 
+var _ base.InternalIterator = (*flushIterator)(nil) + +func (it *flushIterator) String() string { + return "memtable" +} + +func (it *flushIterator) SeekGE( + key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + panic("pebble: SeekGE unimplemented") +} + +func (it *flushIterator) SeekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + panic("pebble: SeekPrefixGE unimplemented") +} + +func (it *flushIterator) SeekLT( + key []byte, flags base.SeekLTFlags, +) (*base.InternalKey, base.LazyValue) { + panic("pebble: SeekLT unimplemented") +} + +// First seeks position at the first entry in list. Returns the key and value +// if the iterator is pointing at a valid entry, and (nil, nil) otherwise. Note +// that First only checks the upper bound. It is up to the caller to ensure +// that key is greater than or equal to the lower bound. +func (it *flushIterator) First() (*base.InternalKey, base.LazyValue) { + key, val := it.Iterator.First() + if key == nil { + return nil, base.LazyValue{} + } + *it.bytesIterated += uint64(it.nd.allocSize) + return key, val +} + +// Next advances to the next position. Returns the key and value if the +// iterator is pointing at a valid entry, and (nil, nil) otherwise. +// Note: flushIterator.Next mirrors the implementation of Iterator.Next +// due to performance. Keep the two in sync. 
+func (it *flushIterator) Next() (*base.InternalKey, base.LazyValue) { + it.nd = it.list.getNext(it.nd, 0) + if it.nd == it.list.tail { + return nil, base.LazyValue{} + } + it.decodeKey() + *it.bytesIterated += uint64(it.nd.allocSize) + return &it.key, base.MakeInPlaceValue(it.value()) +} + +func (it *flushIterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { + panic("pebble: NextPrefix unimplemented") +} + +func (it *flushIterator) Prev() (*base.InternalKey, base.LazyValue) { + panic("pebble: Prev unimplemented") +} diff --git a/pebble/internal/arenaskl/iterator.go b/pebble/internal/arenaskl/iterator.go new file mode 100644 index 0000000..a41dd7e --- /dev/null +++ b/pebble/internal/arenaskl/iterator.go @@ -0,0 +1,275 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package arenaskl + +import ( + "context" + "sync" + + "github.com/cockroachdb/pebble/internal/base" +) + +type splice struct { + prev *node + next *node +} + +func (s *splice) init(prev, next *node) { + s.prev = prev + s.next = next +} + +// Iterator is an iterator over the skiplist object. Use Skiplist.NewIter +// to construct an iterator. The current state of the iterator can be cloned by +// simply value copying the struct. All iterator methods are thread-safe. 
+type Iterator struct { + list *Skiplist + nd *node + key base.InternalKey + lower []byte + upper []byte +} + +// Iterator implements the base.InternalIterator interface. +var _ base.InternalIterator = (*Iterator)(nil) + +var iterPool = sync.Pool{ + New: func() interface{} { + return &Iterator{} + }, +} + +// Close resets the iterator. +func (it *Iterator) Close() error { + it.list = nil + it.nd = nil + it.lower = nil + it.upper = nil + iterPool.Put(it) + return nil +} + +func (it *Iterator) String() string { + return "memtable" +} + +// Error returns any accumulated error. +func (it *Iterator) Error() error { + return nil +} + +// SeekGE moves the iterator to the first entry whose key is greater than or +// equal to the given key. Returns the key and value if the iterator is +// pointing at a valid entry, and (nil, nil) otherwise. Note that SeekGE only +// checks the upper bound. It is up to the caller to ensure that key is greater +// than or equal to the lower bound. +func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) (*base.InternalKey, base.LazyValue) { + if flags.TrySeekUsingNext() { + if it.nd == it.list.tail { + // Iterator is done. + return nil, base.LazyValue{} + } + less := it.list.cmp(it.key.UserKey, key) < 0 + // Arbitrary constant. By measuring the seek cost as a function of the + // number of elements in the skip list, and fitting to a model, we + // could adjust the number of nexts based on the current size of the + // skip list. + const numNexts = 5 + for i := 0; less && i < numNexts; i++ { + k, _ := it.Next() + if k == nil { + // Iterator is done. 
+ return nil, base.LazyValue{} + } + less = it.list.cmp(it.key.UserKey, key) < 0 + } + if !less { + return &it.key, base.MakeInPlaceValue(it.value()) + } + } + _, it.nd, _ = it.seekForBaseSplice(key) + if it.nd == it.list.tail { + return nil, base.LazyValue{} + } + it.decodeKey() + if it.upper != nil && it.list.cmp(it.upper, it.key.UserKey) <= 0 { + it.nd = it.list.tail + return nil, base.LazyValue{} + } + return &it.key, base.MakeInPlaceValue(it.value()) +} + +// SeekPrefixGE moves the iterator to the first entry whose key is greater than +// or equal to the given key. This method is equivalent to SeekGE and is +// provided so that an arenaskl.Iterator implements the +// internal/base.InternalIterator interface. +func (it *Iterator) SeekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + return it.SeekGE(key, flags) +} + +// SeekLT moves the iterator to the last entry whose key is less than the given +// key. Returns the key and value if the iterator is pointing at a valid entry, +// and (nil, nil) otherwise. Note that SeekLT only checks the lower bound. It +// is up to the caller to ensure that key is less than the upper bound. +func (it *Iterator) SeekLT(key []byte, flags base.SeekLTFlags) (*base.InternalKey, base.LazyValue) { + // NB: the top-level Iterator has already adjusted key based on + // the upper-bound. + it.nd, _, _ = it.seekForBaseSplice(key) + if it.nd == it.list.head { + return nil, base.LazyValue{} + } + it.decodeKey() + if it.lower != nil && it.list.cmp(it.lower, it.key.UserKey) > 0 { + it.nd = it.list.head + return nil, base.LazyValue{} + } + return &it.key, base.MakeInPlaceValue(it.value()) +} + +// First seeks position at the first entry in list. Returns the key and value +// if the iterator is pointing at a valid entry, and (nil, nil) otherwise. Note +// that First only checks the upper bound. It is up to the caller to ensure +// that key is greater than or equal to the lower bound (e.g. 
via a call to SeekGE(lower)). +func (it *Iterator) First() (*base.InternalKey, base.LazyValue) { + it.nd = it.list.getNext(it.list.head, 0) + if it.nd == it.list.tail { + return nil, base.LazyValue{} + } + it.decodeKey() + if it.upper != nil && it.list.cmp(it.upper, it.key.UserKey) <= 0 { + it.nd = it.list.tail + return nil, base.LazyValue{} + } + return &it.key, base.MakeInPlaceValue(it.value()) +} + +// Last seeks position at the last entry in list. Returns the key and value if +// the iterator is pointing at a valid entry, and (nil, nil) otherwise. Note +// that Last only checks the lower bound. It is up to the caller to ensure that +// key is less than the upper bound (e.g. via a call to SeekLT(upper)). +func (it *Iterator) Last() (*base.InternalKey, base.LazyValue) { + it.nd = it.list.getPrev(it.list.tail, 0) + if it.nd == it.list.head { + return nil, base.LazyValue{} + } + it.decodeKey() + if it.lower != nil && it.list.cmp(it.lower, it.key.UserKey) > 0 { + it.nd = it.list.head + return nil, base.LazyValue{} + } + return &it.key, base.MakeInPlaceValue(it.value()) +} + +// Next advances to the next position. Returns the key and value if the +// iterator is pointing at a valid entry, and (nil, nil) otherwise. +// Note: flushIterator.Next mirrors the implementation of Iterator.Next +// due to performance. Keep the two in sync. +func (it *Iterator) Next() (*base.InternalKey, base.LazyValue) { + it.nd = it.list.getNext(it.nd, 0) + if it.nd == it.list.tail { + return nil, base.LazyValue{} + } + it.decodeKey() + if it.upper != nil && it.list.cmp(it.upper, it.key.UserKey) <= 0 { + it.nd = it.list.tail + return nil, base.LazyValue{} + } + return &it.key, base.MakeInPlaceValue(it.value()) +} + +// NextPrefix advances to the next position with a new prefix. Returns the key +// and value if the iterator is pointing at a valid entry, and (nil, nil) +// otherwise. 
+func (it *Iterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { + return it.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) +} + +// Prev moves to the previous position. Returns the key and value if the +// iterator is pointing at a valid entry, and (nil, nil) otherwise. +func (it *Iterator) Prev() (*base.InternalKey, base.LazyValue) { + it.nd = it.list.getPrev(it.nd, 0) + if it.nd == it.list.head { + return nil, base.LazyValue{} + } + it.decodeKey() + if it.lower != nil && it.list.cmp(it.lower, it.key.UserKey) > 0 { + it.nd = it.list.head + return nil, base.LazyValue{} + } + return &it.key, base.MakeInPlaceValue(it.value()) +} + +// value returns the value at the current position. +func (it *Iterator) value() []byte { + return it.nd.getValue(it.list.arena) +} + +// Head true iff the iterator is positioned at the sentinel head node. +func (it *Iterator) Head() bool { + return it.nd == it.list.head +} + +// Tail true iff the iterator is positioned at the sentinel tail node. +func (it *Iterator) Tail() bool { + return it.nd == it.list.tail +} + +// SetBounds sets the lower and upper bounds for the iterator. Note that the +// result of Next and Prev will be undefined until the iterator has been +// repositioned with SeekGE, SeekPrefixGE, SeekLT, First, or Last. +func (it *Iterator) SetBounds(lower, upper []byte) { + it.lower = lower + it.upper = upper +} + +// SetContext implements base.InternalIterator. 
+func (it *Iterator) SetContext(_ context.Context) {} + +func (it *Iterator) decodeKey() { + it.key.UserKey = it.list.arena.getBytes(it.nd.keyOffset, it.nd.keySize) + it.key.Trailer = it.nd.keyTrailer +} + +func (it *Iterator) seekForBaseSplice(key []byte) (prev, next *node, found bool) { + ikey := base.MakeSearchKey(key) + level := int(it.list.Height() - 1) + + prev = it.list.head + for { + prev, next, found = it.list.findSpliceForLevel(ikey, level, prev) + + if found { + if level != 0 { + // next is pointing at the target node, but we need to find previous on + // the bottom level. + prev = it.list.getPrev(next, 0) + } + break + } + + if level == 0 { + break + } + + level-- + } + + return +} diff --git a/pebble/internal/arenaskl/node.go b/pebble/internal/arenaskl/node.go new file mode 100644 index 0000000..d464bc5 --- /dev/null +++ b/pebble/internal/arenaskl/node.go @@ -0,0 +1,133 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package arenaskl + +import ( + "math" + "sync/atomic" + + "github.com/cockroachdb/pebble/internal/base" +) + +// MaxNodeSize returns the maximum space needed for a node with the specified +// key and value sizes. This could overflow a uint32, which is why a uint64 +// is used here. If a key/value overflows a uint32, it should not be added to +// the skiplist. 
+func MaxNodeSize(keySize, valueSize uint32) uint64 { + const maxPadding = nodeAlignment - 1 + return uint64(maxNodeSize) + uint64(keySize) + uint64(valueSize) + maxPadding +} + +type links struct { + nextOffset atomic.Uint32 + prevOffset atomic.Uint32 +} + +func (l *links) init(prevOffset, nextOffset uint32) { + l.nextOffset.Store(nextOffset) + l.prevOffset.Store(prevOffset) +} + +type node struct { + // Immutable fields, so no need to lock to access key. + keyOffset uint32 + keySize uint32 + keyTrailer uint64 + valueSize uint32 + allocSize uint32 + + // Most nodes do not need to use the full height of the tower, since the + // probability of each successive level decreases exponentially. Because + // these elements are never accessed, they do not need to be allocated. + // Therefore, when a node is allocated in the arena, its memory footprint + // is deliberately truncated to not include unneeded tower elements. + // + // All accesses to elements should use CAS operations, with no need to lock. 
+ tower [maxHeight]links +} + +func newNode( + arena *Arena, height uint32, key base.InternalKey, value []byte, +) (nd *node, err error) { + if height < 1 || height > maxHeight { + panic("height cannot be less than one or greater than the max height") + } + keySize := len(key.UserKey) + if int64(keySize) > math.MaxUint32 { + panic("key is too large") + } + valueSize := len(value) + if int64(len(value)) > math.MaxUint32 { + panic("value is too large") + } + if int64(len(value))+int64(keySize)+int64(maxNodeSize) > math.MaxUint32 { + panic("combined key and value size is too large") + } + + nd, err = newRawNode(arena, height, uint32(keySize), uint32(valueSize)) + if err != nil { + return + } + nd.keyTrailer = key.Trailer + copy(nd.getKeyBytes(arena), key.UserKey) + copy(nd.getValue(arena), value) + return +} + +func newRawNode(arena *Arena, height uint32, keySize, valueSize uint32) (nd *node, err error) { + // Compute the amount of the tower that will never be used, since the height + // is less than maxHeight. 
+ unusedSize := uint32((maxHeight - int(height)) * linksSize) + nodeSize := uint32(maxNodeSize) - unusedSize + + nodeOffset, allocSize, err := arena.alloc(nodeSize+keySize+valueSize, nodeAlignment, unusedSize) + if err != nil { + return + } + + nd = (*node)(arena.getPointer(nodeOffset)) + nd.keyOffset = nodeOffset + nodeSize + nd.keySize = keySize + nd.valueSize = valueSize + nd.allocSize = allocSize + return +} + +func (n *node) getKeyBytes(arena *Arena) []byte { + return arena.getBytes(n.keyOffset, n.keySize) +} + +func (n *node) getValue(arena *Arena) []byte { + return arena.getBytes(n.keyOffset+n.keySize, uint32(n.valueSize)) +} + +func (n *node) nextOffset(h int) uint32 { + return n.tower[h].nextOffset.Load() +} + +func (n *node) prevOffset(h int) uint32 { + return n.tower[h].prevOffset.Load() +} + +func (n *node) casNextOffset(h int, old, val uint32) bool { + return n.tower[h].nextOffset.CompareAndSwap(old, val) +} + +func (n *node) casPrevOffset(h int, old, val uint32) bool { + return n.tower[h].prevOffset.CompareAndSwap(old, val) +} diff --git a/pebble/internal/arenaskl/race_test.go b/pebble/internal/arenaskl/race_test.go new file mode 100644 index 0000000..b9310c9 --- /dev/null +++ b/pebble/internal/arenaskl/race_test.go @@ -0,0 +1,42 @@ +//go:build race +// +build race + +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package arenaskl + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +// TestNodeArenaEnd tests allocating a node at the boundary of an arena. In Go +// 1.14 when the race detector is running, Go will also perform some pointer +// alignment checks. It will detect alignment issues, for example #667 where a +// node's memory would straddle the arena boundary, with unused regions of the +// node struct dipping into unallocated memory. 
This test is only run when the +// race build tag is provided. +func TestNodeArenaEnd(t *testing.T) { + ikey := makeIkey("a") + val := []byte("b") + + // Rather than hardcode an arena size at just the right size, try + // allocating using successively larger arena sizes until we allocate + // successfully. The prior attempt will have exercised the right code + // path. + for i := uint32(1); i < 256; i++ { + a := newArena(i) + _, err := newNode(a, 1, ikey, val) + if err == nil { + // We reached an arena size big enough to allocate a node. + // If there's an issue at the boundary, the race detector would + // have found it by now. + t.Log(i) + break + } + require.Equal(t, ErrArenaFull, err) + } +} diff --git a/pebble/internal/arenaskl/skl.go b/pebble/internal/arenaskl/skl.go new file mode 100644 index 0000000..ef1ebfc --- /dev/null +++ b/pebble/internal/arenaskl/skl.go @@ -0,0 +1,464 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +Adapted from RocksDB inline skiplist. + +Key differences: +- No optimization for sequential inserts (no "prev"). +- No custom comparator. +- Support overwrites. This requires care when we see the same key when inserting. + For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so + there is no need for values. We don't intend to support versioning. 
In-place updates of values + would be more efficient. +- We discard all non-concurrent code. +- We do not support Splices. This simplifies the code a lot. +- No AllocateNode or other pointer arithmetic. +- We combine the findLessThan, findGreaterOrEqual, etc into one function. +*/ + +/* +Further adapted from Badger: https://github.com/dgraph-io/badger. + +Key differences: +- Support for previous pointers - doubly linked lists. Note that it's up to higher + level code to deal with the intermediate state that occurs during insertion, + where node A is linked to node B, but node B is not yet linked back to node A. +- Iterator includes mutator functions. +*/ + +package arenaskl // import "github.com/cockroachdb/pebble/internal/arenaskl" + +import ( + "math" + "runtime" + "sync/atomic" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/fastrand" +) + +const ( + maxHeight = 20 + maxNodeSize = int(unsafe.Sizeof(node{})) + linksSize = int(unsafe.Sizeof(links{})) + pValue = 1 / math.E +) + +// ErrRecordExists indicates that an entry with the specified key already +// exists in the skiplist. Duplicate entries are not directly supported and +// instead must be handled by the user by appending a unique version suffix to +// keys. +var ErrRecordExists = errors.New("record with this key already exists") + +// Skiplist is a fast, concurrent skiplist implementation that supports forward +// and backward iteration. See batchskl.Skiplist for a non-concurrent +// skiplist. Keys and values are immutable once added to the skiplist and +// deletion is not supported. Instead, higher-level code is expected to add new +// entries that shadow existing entries and perform deletion via tombstones. It +// is up to the user to process these shadow entries and tombstones +// appropriately during retrieval. 
+type Skiplist struct { + arena *Arena + cmp base.Compare + head *node + tail *node + height atomic.Uint32 // Current height. 1 <= height <= maxHeight. CAS. + + // If set to true by tests, then extra delays are added to make it easier to + // detect unusual race conditions. + testing bool +} + +// Inserter TODO(peter) +type Inserter struct { + spl [maxHeight]splice + height uint32 +} + +// Add TODO(peter) +func (ins *Inserter) Add(list *Skiplist, key base.InternalKey, value []byte) error { + return list.addInternal(key, value, ins) +} + +var ( + probabilities [maxHeight]uint32 +) + +func init() { + // Precompute the skiplist probabilities so that only a single random number + // needs to be generated and so that the optimal pvalue can be used (inverse + // of Euler's number). + p := float64(1.0) + for i := 0; i < maxHeight; i++ { + probabilities[i] = uint32(float64(math.MaxUint32) * p) + p *= pValue + } +} + +// NewSkiplist constructs and initializes a new, empty skiplist. All nodes, keys, +// and values in the skiplist will be allocated from the given arena. +func NewSkiplist(arena *Arena, cmp base.Compare) *Skiplist { + skl := &Skiplist{} + skl.Reset(arena, cmp) + return skl +} + +// Reset the skiplist to empty and re-initialize. +func (s *Skiplist) Reset(arena *Arena, cmp base.Compare) { + // Allocate head and tail nodes. + head, err := newRawNode(arena, maxHeight, 0, 0) + if err != nil { + panic("arenaSize is not large enough to hold the head node") + } + head.keyOffset = 0 + + tail, err := newRawNode(arena, maxHeight, 0, 0) + if err != nil { + panic("arenaSize is not large enough to hold the tail node") + } + tail.keyOffset = 0 + + // Link all head/tail levels together. 
+ headOffset := arena.getPointerOffset(unsafe.Pointer(head)) + tailOffset := arena.getPointerOffset(unsafe.Pointer(tail)) + for i := 0; i < maxHeight; i++ { + head.tower[i].nextOffset.Store(tailOffset) + tail.tower[i].prevOffset.Store(headOffset) + } + + *s = Skiplist{ + arena: arena, + cmp: cmp, + head: head, + tail: tail, + } + s.height.Store(1) +} + +// Height returns the height of the highest tower within any of the nodes that +// have ever been allocated as part of this skiplist. +func (s *Skiplist) Height() uint32 { return s.height.Load() } + +// Arena returns the arena backing this skiplist. +func (s *Skiplist) Arena() *Arena { return s.arena } + +// Size returns the number of bytes that have allocated from the arena. +func (s *Skiplist) Size() uint32 { return s.arena.Size() } + +// Add adds a new key if it does not yet exist. If the key already exists, then +// Add returns ErrRecordExists. If there isn't enough room in the arena, then +// Add returns ErrArenaFull. +func (s *Skiplist) Add(key base.InternalKey, value []byte) error { + var ins Inserter + return s.addInternal(key, value, &ins) +} + +func (s *Skiplist) addInternal(key base.InternalKey, value []byte, ins *Inserter) error { + if s.findSplice(key, ins) { + // Found a matching node, but handle case where it's been deleted. + return ErrRecordExists + } + + if s.testing { + // Add delay to make it easier to test race between this thread + // and another thread that sees the intermediate state between + // finding the splice and using it. + runtime.Gosched() + } + + nd, height, err := s.newNode(key, value) + if err != nil { + return err + } + + ndOffset := s.arena.getPointerOffset(unsafe.Pointer(nd)) + + // We always insert from the base level and up. After you add a node in base + // level, we cannot create a node in the level above because it would have + // discovered the node in the base level. 
+ var found bool + var invalidateSplice bool + for i := 0; i < int(height); i++ { + prev := ins.spl[i].prev + next := ins.spl[i].next + + if prev == nil { + // New node increased the height of the skiplist, so assume that the + // new level has not yet been populated. + if next != nil { + panic("next is expected to be nil, since prev is nil") + } + + prev = s.head + next = s.tail + } + + // +----------------+ +------------+ +----------------+ + // | prev | | nd | | next | + // | prevNextOffset |---->| | | | + // | |<----| prevOffset | | | + // | | | nextOffset |---->| | + // | | | |<----| nextPrevOffset | + // +----------------+ +------------+ +----------------+ + // + // 1. Initialize prevOffset and nextOffset to point to prev and next. + // 2. CAS prevNextOffset to repoint from next to nd. + // 3. CAS nextPrevOffset to repoint from prev to nd. + for { + prevOffset := s.arena.getPointerOffset(unsafe.Pointer(prev)) + nextOffset := s.arena.getPointerOffset(unsafe.Pointer(next)) + nd.tower[i].init(prevOffset, nextOffset) + + // Check whether next has an updated link to prev. If it does not, + // that can mean one of two things: + // 1. The thread that added the next node hasn't yet had a chance + // to add the prev link (but will shortly). + // 2. Another thread has added a new node between prev and next. + nextPrevOffset := next.prevOffset(i) + if nextPrevOffset != prevOffset { + // Determine whether #1 or #2 is true by checking whether prev + // is still pointing to next. As long as the atomic operations + // have at least acquire/release semantics (no need for + // sequential consistency), this works, as it is equivalent to + // the "publication safety" pattern. + prevNextOffset := prev.nextOffset(i) + if prevNextOffset == nextOffset { + // Ok, case #1 is true, so help the other thread along by + // updating the next node's prev link. 
+ next.casPrevOffset(i, nextPrevOffset, prevOffset) + } + } + + if prev.casNextOffset(i, nextOffset, ndOffset) { + // Managed to insert nd between prev and next, so update the next + // node's prev link and go to the next level. + if s.testing { + // Add delay to make it easier to test race between this thread + // and another thread that sees the intermediate state between + // setting next and setting prev. + runtime.Gosched() + } + + next.casPrevOffset(i, prevOffset, ndOffset) + break + } + + // CAS failed. We need to recompute prev and next. It is unlikely to + // be helpful to try to use a different level as we redo the search, + // because it is unlikely that lots of nodes are inserted between prev + // and next. + prev, next, found = s.findSpliceForLevel(key, i, prev) + if found { + if i != 0 { + panic("how can another thread have inserted a node at a non-base level?") + } + + return ErrRecordExists + } + invalidateSplice = true + } + } + + // If we had to recompute the splice for a level, invalidate the entire + // cached splice. + if invalidateSplice { + ins.height = 0 + } else { + // The splice was valid. We inserted a node between spl[i].prev and + // spl[i].next. Optimistically update spl[i].prev for use in a subsequent + // call to add. + for i := uint32(0); i < height; i++ { + ins.spl[i].prev = nd + } + } + + return nil +} + +// NewIter returns a new Iterator object. The lower and upper bound parameters +// control the range of keys the iterator will return. Specifying for nil for +// lower or upper bound disables the check for that boundary. Note that lower +// bound is not checked on {SeekGE,First} and upper bound is not check on +// {SeekLT,Last}. The user is expected to perform that check. Note that it is +// safe for an iterator to be copied by value. 
+func (s *Skiplist) NewIter(lower, upper []byte) *Iterator { + it := iterPool.Get().(*Iterator) + *it = Iterator{list: s, nd: s.head, lower: lower, upper: upper} + return it +} + +// NewFlushIter returns a new flushIterator, which is similar to an Iterator +// but also sets the current number of the bytes that have been iterated +// through. +func (s *Skiplist) NewFlushIter(bytesFlushed *uint64) base.InternalIterator { + return &flushIterator{ + Iterator: Iterator{list: s, nd: s.head}, + bytesIterated: bytesFlushed, + } +} + +func (s *Skiplist) newNode( + key base.InternalKey, value []byte, +) (nd *node, height uint32, err error) { + height = s.randomHeight() + nd, err = newNode(s.arena, height, key, value) + if err != nil { + return + } + + // Try to increase s.height via CAS. + listHeight := s.Height() + for height > listHeight { + if s.height.CompareAndSwap(listHeight, height) { + // Successfully increased skiplist.height. + break + } + + listHeight = s.Height() + } + + return +} + +func (s *Skiplist) randomHeight() uint32 { + rnd := fastrand.Uint32() + + h := uint32(1) + for h < maxHeight && rnd <= probabilities[h] { + h++ + } + + return h +} + +func (s *Skiplist) findSplice(key base.InternalKey, ins *Inserter) (found bool) { + listHeight := s.Height() + var level int + + prev := s.head + if ins.height < listHeight { + // Our cached height is less than the list height, which means there were + // inserts that increased the height of the list. Recompute the splice from + // scratch. + ins.height = listHeight + level = int(ins.height) + } else { + // Our cached height is equal to the list height. + for ; level < int(listHeight); level++ { + spl := &ins.spl[level] + if s.getNext(spl.prev, level) != spl.next { + // One or more nodes have been inserted between the splice at this + // level. + continue + } + if spl.prev != s.head && !s.keyIsAfterNode(spl.prev, key) { + // Key lies before splice. 
+ level = int(listHeight) + break + } + if spl.next != s.tail && s.keyIsAfterNode(spl.next, key) { + // Key lies after splice. + level = int(listHeight) + break + } + // The splice brackets the key! + prev = spl.prev + break + } + } + + for level = level - 1; level >= 0; level-- { + var next *node + prev, next, found = s.findSpliceForLevel(key, level, prev) + if next == nil { + next = s.tail + } + ins.spl[level].init(prev, next) + } + + return +} + +func (s *Skiplist) findSpliceForLevel( + key base.InternalKey, level int, start *node, +) (prev, next *node, found bool) { + prev = start + + for { + // Assume prev.key < key. + next = s.getNext(prev, level) + if next == s.tail { + // Tail node, so done. + break + } + + offset, size := next.keyOffset, next.keySize + nextKey := s.arena.buf[offset : offset+size] + cmp := s.cmp(key.UserKey, nextKey) + if cmp < 0 { + // We are done for this level, since prev.key < key < next.key. + break + } + if cmp == 0 { + // User-key equality. + if key.Trailer == next.keyTrailer { + // Internal key equality. + found = true + break + } + if key.Trailer > next.keyTrailer { + // We are done for this level, since prev.key < key < next.key. + break + } + } + + // Keep moving right on this level. + prev = next + } + + return +} + +func (s *Skiplist) keyIsAfterNode(nd *node, key base.InternalKey) bool { + ndKey := s.arena.buf[nd.keyOffset : nd.keyOffset+nd.keySize] + cmp := s.cmp(ndKey, key.UserKey) + if cmp < 0 { + return true + } + if cmp > 0 { + return false + } + // User-key equality. + if key.Trailer == nd.keyTrailer { + // Internal key equality. 
+ return false + } + return key.Trailer < nd.keyTrailer +} + +func (s *Skiplist) getNext(nd *node, h int) *node { + offset := nd.tower[h].nextOffset.Load() + return (*node)(s.arena.getPointer(offset)) +} + +func (s *Skiplist) getPrev(nd *node, h int) *node { + offset := nd.tower[h].prevOffset.Load() + return (*node)(s.arena.getPointer(offset)) +} diff --git a/pebble/internal/arenaskl/skl_test.go b/pebble/internal/arenaskl/skl_test.go new file mode 100644 index 0000000..6e74a4a --- /dev/null +++ b/pebble/internal/arenaskl/skl_test.go @@ -0,0 +1,972 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package arenaskl + +import ( + "bytes" + "encoding/binary" + "fmt" + "strconv" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/stretchr/testify/require" + "golang.org/x/exp/rand" +) + +const arenaSize = 1 << 20 + +// iterAdapter adapts the new Iterator API which returns the key and value from +// positioning methods (Seek*, First, Last, Next, Prev) to the old API which +// returned a boolean corresponding to Valid. Only used by test code. 
+type iterAdapter struct { + *Iterator + key *base.InternalKey + val []byte +} + +func newIterAdapter(iter *Iterator) *iterAdapter { + return &iterAdapter{ + Iterator: iter, + } +} + +func (i *iterAdapter) update(key *base.InternalKey, val base.LazyValue) bool { + i.key = key + i.val = val.InPlaceValue() + return i.key != nil +} + +func (i *iterAdapter) String() string { + return "iter-adapter" +} + +func (i *iterAdapter) SeekGE(key []byte, flags base.SeekGEFlags) bool { + return i.update(i.Iterator.SeekGE(key, flags)) +} + +func (i *iterAdapter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) bool { + return i.update(i.Iterator.SeekPrefixGE(prefix, key, flags)) +} + +func (i *iterAdapter) SeekLT(key []byte, flags base.SeekLTFlags) bool { + return i.update(i.Iterator.SeekLT(key, flags)) +} + +func (i *iterAdapter) First() bool { + return i.update(i.Iterator.First()) +} + +func (i *iterAdapter) Last() bool { + return i.update(i.Iterator.Last()) +} + +func (i *iterAdapter) Next() bool { + return i.update(i.Iterator.Next()) +} + +func (i *iterAdapter) Prev() bool { + return i.update(i.Iterator.Prev()) +} + +func (i *iterAdapter) Key() base.InternalKey { + return *i.key +} + +func (i *iterAdapter) Value() []byte { + return i.val +} + +func (i *iterAdapter) Valid() bool { + return i.key != nil +} + +func makeIntKey(i int) base.InternalKey { + return base.InternalKey{UserKey: []byte(fmt.Sprintf("%05d", i))} +} + +func makeKey(s string) []byte { + return []byte(s) +} + +func makeIkey(s string) base.InternalKey { + return base.InternalKey{UserKey: []byte(s)} +} + +func makeValue(i int) []byte { + return []byte(fmt.Sprintf("v%05d", i)) +} + +func makeInserterAdd(s *Skiplist) func(key base.InternalKey, value []byte) error { + ins := &Inserter{} + return func(key base.InternalKey, value []byte) error { + return ins.Add(s, key, value) + } +} + +// length iterates over skiplist to give exact size. 
+func length(s *Skiplist) int { + count := 0 + + it := newIterAdapter(s.NewIter(nil, nil)) + for valid := it.First(); valid; valid = it.Next() { + count++ + } + + return count +} + +// length iterates over skiplist in reverse order to give exact size. +func lengthRev(s *Skiplist) int { + count := 0 + + it := newIterAdapter(s.NewIter(nil, nil)) + for valid := it.Last(); valid; valid = it.Prev() { + count++ + } + + return count +} + +func TestEmpty(t *testing.T) { + key := makeKey("aaa") + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + it := newIterAdapter(l.NewIter(nil, nil)) + + require.False(t, it.Valid()) + + it.First() + require.False(t, it.Valid()) + + it.Last() + require.False(t, it.Valid()) + + require.False(t, it.SeekGE(key, base.SeekGEFlagsNone)) + require.False(t, it.Valid()) +} + +func TestFull(t *testing.T) { + l := NewSkiplist(newArena(1000), bytes.Compare) + + foundArenaFull := false + for i := 0; i < 100; i++ { + err := l.Add(makeIntKey(i), makeValue(i)) + if err == ErrArenaFull { + foundArenaFull = true + break + } + } + + require.True(t, foundArenaFull) + + err := l.Add(makeIkey("someval"), nil) + require.Equal(t, ErrArenaFull, err) +} + +// TestBasic tests single-threaded seeks and adds. +func TestBasic(t *testing.T) { + for _, inserter := range []bool{false, true} { + t.Run(fmt.Sprintf("inserter=%t", inserter), func(t *testing.T) { + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + it := newIterAdapter(l.NewIter(nil, nil)) + + add := l.Add + if inserter { + add = makeInserterAdd(l) + } + + // Try adding values. 
+ add(makeIkey("key1"), makeValue(1)) + add(makeIkey("key3"), makeValue(3)) + add(makeIkey("key2"), makeValue(2)) + + require.True(t, it.SeekGE(makeKey("key"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.NotEqual(t, "key", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey("key1"), base.SeekGEFlagsNone)) + require.EqualValues(t, "key1", it.Key().UserKey) + require.EqualValues(t, makeValue(1), it.Value()) + + require.True(t, it.SeekGE(makeKey("key2"), base.SeekGEFlagsNone)) + require.EqualValues(t, "key2", it.Key().UserKey) + require.EqualValues(t, makeValue(2), it.Value()) + + require.True(t, it.SeekGE(makeKey("key3"), base.SeekGEFlagsNone)) + require.EqualValues(t, "key3", it.Key().UserKey) + require.EqualValues(t, makeValue(3), it.Value()) + + key := makeIkey("a") + key.SetSeqNum(1) + add(key, nil) + key.SetSeqNum(2) + add(key, nil) + + require.True(t, it.SeekGE(makeKey("a"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "a", it.Key().UserKey) + require.EqualValues(t, 2, it.Key().SeqNum()) + + require.True(t, it.Next()) + require.True(t, it.Valid()) + require.EqualValues(t, "a", it.Key().UserKey) + require.EqualValues(t, 1, it.Key().SeqNum()) + + key = makeIkey("b") + key.SetSeqNum(2) + add(key, nil) + key.SetSeqNum(1) + add(key, nil) + + require.True(t, it.SeekGE(makeKey("b"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "b", it.Key().UserKey) + require.EqualValues(t, 2, it.Key().SeqNum()) + + require.True(t, it.Next()) + require.True(t, it.Valid()) + require.EqualValues(t, "b", it.Key().UserKey) + require.EqualValues(t, 1, it.Key().SeqNum()) + }) + } +} + +// TestConcurrentBasic tests concurrent writes followed by concurrent reads. 
+func TestConcurrentBasic(t *testing.T) { + const n = 1000 + + for _, inserter := range []bool{false, true} { + t.Run(fmt.Sprintf("inserter=%t", inserter), func(t *testing.T) { + // Set testing flag to make it easier to trigger unusual race conditions. + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + l.testing = true + + var wg sync.WaitGroup + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + + if inserter { + var ins Inserter + ins.Add(l, makeIntKey(i), makeValue(i)) + } else { + l.Add(makeIntKey(i), makeValue(i)) + } + }(i) + } + wg.Wait() + + // Check values. Concurrent reads. + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + + it := newIterAdapter(l.NewIter(nil, nil)) + require.True(t, it.SeekGE(makeKey(fmt.Sprintf("%05d", i)), base.SeekGEFlagsNone)) + require.EqualValues(t, fmt.Sprintf("%05d", i), it.Key().UserKey) + }(i) + } + wg.Wait() + require.Equal(t, n, length(l)) + require.Equal(t, n, lengthRev(l)) + }) + } +} + +// TestConcurrentOneKey will read while writing to one single key. +func TestConcurrentOneKey(t *testing.T) { + const n = 100 + key := makeKey("thekey") + ikey := makeIkey("thekey") + + for _, inserter := range []bool{false, true} { + t.Run(fmt.Sprintf("inserter=%t", inserter), func(t *testing.T) { + // Set testing flag to make it easier to trigger unusual race conditions. + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + l.testing = true + + var wg sync.WaitGroup + writeDone := make(chan struct{}, 1) + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer func() { + wg.Done() + select { + case writeDone <- struct{}{}: + default: + } + }() + + if inserter { + var ins Inserter + ins.Add(l, ikey, makeValue(i)) + } else { + l.Add(ikey, makeValue(i)) + } + }(i) + } + // Wait until at least some write made it such that reads return a value. 
+ <-writeDone + var sawValue atomic.Int32 + for i := 0; i < n; i++ { + wg.Add(1) + go func() { + defer wg.Done() + + it := newIterAdapter(l.NewIter(nil, nil)) + it.SeekGE(key, base.SeekGEFlagsNone) + require.True(t, it.Valid()) + require.True(t, bytes.Equal(key, it.Key().UserKey)) + + sawValue.Add(1) + v, err := strconv.Atoi(string(it.Value()[1:])) + require.NoError(t, err) + require.True(t, 0 <= v && v < n) + }() + } + wg.Wait() + require.Equal(t, int32(n), sawValue.Load()) + require.Equal(t, 1, length(l)) + require.Equal(t, 1, lengthRev(l)) + }) + } +} + +func TestSkiplistAdd(t *testing.T) { + for _, inserter := range []bool{false, true} { + t.Run(fmt.Sprintf("inserter=%t", inserter), func(t *testing.T) { + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + it := newIterAdapter(l.NewIter(nil, nil)) + + add := l.Add + if inserter { + add = makeInserterAdd(l) + } + + // Add nil key and value (treated same as empty). + err := add(base.InternalKey{}, nil) + require.Nil(t, err) + require.True(t, it.SeekGE([]byte{}, base.SeekGEFlagsNone)) + require.EqualValues(t, []byte{}, it.Key().UserKey) + require.EqualValues(t, []byte{}, it.Value()) + + l = NewSkiplist(newArena(arenaSize), bytes.Compare) + it = newIterAdapter(l.NewIter(nil, nil)) + + add = l.Add + if inserter { + add = makeInserterAdd(l) + } + + // Add empty key and value (treated same as nil). + err = add(makeIkey(""), []byte{}) + require.Nil(t, err) + require.True(t, it.SeekGE([]byte{}, base.SeekGEFlagsNone)) + require.EqualValues(t, []byte{}, it.Key().UserKey) + require.EqualValues(t, []byte{}, it.Value()) + + // Add to empty list. + err = add(makeIntKey(2), makeValue(2)) + require.Nil(t, err) + require.True(t, it.SeekGE(makeKey("00002"), base.SeekGEFlagsNone)) + require.EqualValues(t, "00002", it.Key().UserKey) + require.EqualValues(t, makeValue(2), it.Value()) + + // Add first element in non-empty list. 
+ err = add(makeIntKey(1), makeValue(1)) + require.Nil(t, err) + require.True(t, it.SeekGE(makeKey("00001"), base.SeekGEFlagsNone)) + require.EqualValues(t, "00001", it.Key().UserKey) + require.EqualValues(t, makeValue(1), it.Value()) + + // Add last element in non-empty list. + err = add(makeIntKey(4), makeValue(4)) + require.Nil(t, err) + require.True(t, it.SeekGE(makeKey("00004"), base.SeekGEFlagsNone)) + require.EqualValues(t, "00004", it.Key().UserKey) + require.EqualValues(t, makeValue(4), it.Value()) + + // Add element in middle of list. + err = add(makeIntKey(3), makeValue(3)) + require.Nil(t, err) + require.True(t, it.SeekGE(makeKey("00003"), base.SeekGEFlagsNone)) + require.EqualValues(t, "00003", it.Key().UserKey) + require.EqualValues(t, makeValue(3), it.Value()) + + // Try to add element that already exists. + err = add(makeIntKey(2), nil) + require.Equal(t, ErrRecordExists, err) + require.EqualValues(t, "00003", it.Key().UserKey) + require.EqualValues(t, makeValue(3), it.Value()) + + require.Equal(t, 5, length(l)) + require.Equal(t, 5, lengthRev(l)) + }) + } +} + +// TestConcurrentAdd races between adding same nodes. +func TestConcurrentAdd(t *testing.T) { + for _, inserter := range []bool{false, true} { + t.Run(fmt.Sprintf("inserter=%t", inserter), func(t *testing.T) { + const n = 100 + + // Set testing flag to make it easier to trigger unusual race conditions. 
+ l := NewSkiplist(newArena(arenaSize), bytes.Compare) + l.testing = true + + start := make([]sync.WaitGroup, n) + end := make([]sync.WaitGroup, n) + + for i := 0; i < n; i++ { + start[i].Add(1) + end[i].Add(2) + } + + for f := 0; f < 2; f++ { + go func(f int) { + it := newIterAdapter(l.NewIter(nil, nil)) + add := l.Add + if inserter { + add = makeInserterAdd(l) + } + + for i := 0; i < n; i++ { + start[i].Wait() + + key := makeIntKey(i) + if add(key, nil) == nil { + require.True(t, it.SeekGE(key.UserKey, base.SeekGEFlagsNone)) + require.EqualValues(t, key, it.Key()) + } + + end[i].Done() + } + }(f) + } + + for i := 0; i < n; i++ { + start[i].Done() + end[i].Wait() + } + + require.Equal(t, n, length(l)) + require.Equal(t, n, lengthRev(l)) + }) + } +} + +// TestIteratorNext tests a basic iteration over all nodes from the beginning. +func TestIteratorNext(t *testing.T) { + const n = 100 + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + it := newIterAdapter(l.NewIter(nil, nil)) + + require.False(t, it.Valid()) + + it.First() + require.False(t, it.Valid()) + + for i := n - 1; i >= 0; i-- { + l.Add(makeIntKey(i), makeValue(i)) + } + + it.First() + for i := 0; i < n; i++ { + require.True(t, it.Valid()) + require.EqualValues(t, makeIntKey(i), it.Key()) + require.EqualValues(t, makeValue(i), it.Value()) + it.Next() + } + require.False(t, it.Valid()) +} + +// TestIteratorPrev tests a basic iteration over all nodes from the end. 
+func TestIteratorPrev(t *testing.T) { + const n = 100 + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + it := newIterAdapter(l.NewIter(nil, nil)) + + require.False(t, it.Valid()) + + it.Last() + require.False(t, it.Valid()) + + var ins Inserter + for i := 0; i < n; i++ { + ins.Add(l, makeIntKey(i), makeValue(i)) + } + + it.Last() + for i := n - 1; i >= 0; i-- { + require.True(t, it.Valid()) + require.EqualValues(t, makeIntKey(i), it.Key()) + require.EqualValues(t, makeValue(i), it.Value()) + it.Prev() + } + require.False(t, it.Valid()) +} + +func TestIteratorSeekGEAndSeekPrefixGE(t *testing.T) { + const n = 100 + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + it := newIterAdapter(l.NewIter(nil, nil)) + + require.False(t, it.Valid()) + it.First() + require.False(t, it.Valid()) + // 1000, 1010, 1020, ..., 1990. + + var ins Inserter + for i := n - 1; i >= 0; i-- { + v := i*10 + 1000 + ins.Add(l, makeIntKey(v), makeValue(v)) + } + + require.True(t, it.SeekGE(makeKey(""), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + require.EqualValues(t, "v01000", it.Value()) + + require.True(t, it.SeekGE(makeKey("01000"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + require.EqualValues(t, "v01000", it.Value()) + + require.True(t, it.SeekGE(makeKey("01005"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01010", it.Key().UserKey) + require.EqualValues(t, "v01010", it.Value()) + + require.True(t, it.SeekGE(makeKey("01010"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01010", it.Key().UserKey) + require.EqualValues(t, "v01010", it.Value()) + + require.False(t, it.SeekGE(makeKey("99999"), base.SeekGEFlagsNone)) + require.False(t, it.Valid()) + + // Test SeekGE with trySeekUsingNext optimization. 
+ { + require.True(t, it.SeekGE(makeKey("01000"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + require.EqualValues(t, "v01000", it.Value()) + + // Seeking to the same key. + require.True(t, it.SeekGE(makeKey("01000"), base.SeekGEFlagsNone.EnableTrySeekUsingNext())) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + require.EqualValues(t, "v01000", it.Value()) + + // Seeking to a nearby key that can be reached using Next. + require.True(t, it.SeekGE(makeKey("01020"), base.SeekGEFlagsNone.EnableTrySeekUsingNext())) + require.True(t, it.Valid()) + require.EqualValues(t, "01020", it.Key().UserKey) + require.EqualValues(t, "v01020", it.Value()) + + // Seeking to a key that cannot be reached using Next. + require.True(t, it.SeekGE(makeKey("01200"), base.SeekGEFlagsNone.EnableTrySeekUsingNext())) + require.True(t, it.Valid()) + require.EqualValues(t, "01200", it.Key().UserKey) + require.EqualValues(t, "v01200", it.Value()) + + // Seeking to an earlier key, but the caller lies. Incorrect result. + require.True(t, it.SeekGE(makeKey("01100"), base.SeekGEFlagsNone.EnableTrySeekUsingNext())) + require.True(t, it.Valid()) + require.EqualValues(t, "01200", it.Key().UserKey) + require.EqualValues(t, "v01200", it.Value()) + + // Telling the truth works. + require.True(t, it.SeekGE(makeKey("01100"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01100", it.Key().UserKey) + require.EqualValues(t, "v01100", it.Value()) + } + + // Test SeekPrefixGE with trySeekUsingNext optimization. + { + require.True(t, it.SeekPrefixGE(makeKey("01000"), makeKey("01000"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + require.EqualValues(t, "v01000", it.Value()) + + // Seeking to the same key. 
+ require.True(t, it.SeekPrefixGE(makeKey("01000"), makeKey("01000"), base.SeekGEFlagsNone.EnableTrySeekUsingNext())) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + require.EqualValues(t, "v01000", it.Value()) + + // Seeking to a nearby key that can be reached using Next. + require.True(t, it.SeekPrefixGE(makeKey("01020"), makeKey("01020"), base.SeekGEFlagsNone.EnableTrySeekUsingNext())) + require.True(t, it.Valid()) + require.EqualValues(t, "01020", it.Key().UserKey) + require.EqualValues(t, "v01020", it.Value()) + + // Seeking to a key that cannot be reached using Next. + require.True(t, it.SeekPrefixGE(makeKey("01200"), makeKey("01200"), base.SeekGEFlagsNone.EnableTrySeekUsingNext())) + require.True(t, it.Valid()) + require.EqualValues(t, "01200", it.Key().UserKey) + require.EqualValues(t, "v01200", it.Value()) + + // Seeking to an earlier key, but the caller lies. Incorrect result. + require.True(t, it.SeekPrefixGE(makeKey("01100"), makeKey("01100"), base.SeekGEFlagsNone.EnableTrySeekUsingNext())) + require.True(t, it.Valid()) + require.EqualValues(t, "01200", it.Key().UserKey) + require.EqualValues(t, "v01200", it.Value()) + + // Telling the truth works. + require.True(t, it.SeekPrefixGE(makeKey("01100"), makeKey("01100"), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01100", it.Key().UserKey) + require.EqualValues(t, "v01100", it.Value()) + } + + // Test seek for empty key. 
+ ins.Add(l, base.InternalKey{}, nil) + require.True(t, it.SeekGE([]byte{}, base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey(""), base.SeekGEFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "", it.Key().UserKey) +} + +func TestIteratorSeekLT(t *testing.T) { + const n = 100 + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + it := newIterAdapter(l.NewIter(nil, nil)) + + require.False(t, it.Valid()) + it.First() + require.False(t, it.Valid()) + // 1000, 1010, 1020, ..., 1990. + var ins Inserter + for i := n - 1; i >= 0; i-- { + v := i*10 + 1000 + ins.Add(l, makeIntKey(v), makeValue(v)) + } + + require.False(t, it.SeekLT(makeKey(""), base.SeekLTFlagsNone)) + require.False(t, it.Valid()) + + require.False(t, it.SeekLT(makeKey("01000"), base.SeekLTFlagsNone)) + require.False(t, it.Valid()) + + require.True(t, it.SeekLT(makeKey("01001"), base.SeekLTFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + require.EqualValues(t, "v01000", it.Value()) + + require.True(t, it.SeekLT(makeKey("01005"), base.SeekLTFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + require.EqualValues(t, "v01000", it.Value()) + + require.True(t, it.SeekLT(makeKey("01991"), base.SeekLTFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01990", it.Key().UserKey) + require.EqualValues(t, "v01990", it.Value()) + + require.True(t, it.SeekLT(makeKey("99999"), base.SeekLTFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "01990", it.Key().UserKey) + require.EqualValues(t, "v01990", it.Value()) + + // Test seek for empty key. 
+ ins.Add(l, base.InternalKey{}, nil) + require.False(t, it.SeekLT([]byte{}, base.SeekLTFlagsNone)) + require.False(t, it.Valid()) + + require.True(t, it.SeekLT(makeKey("\x01"), base.SeekLTFlagsNone)) + require.True(t, it.Valid()) + require.EqualValues(t, "", it.Key().UserKey) +} + +// TODO(peter): test First and Last. +func TestIteratorBounds(t *testing.T) { + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + for i := 1; i < 10; i++ { + require.NoError(t, l.Add(makeIntKey(i), makeValue(i))) + } + + key := func(i int) []byte { + return makeIntKey(i).UserKey + } + + it := newIterAdapter(l.NewIter(key(3), key(7))) + + // SeekGE within the lower and upper bound succeeds. + for i := 3; i <= 6; i++ { + k := key(i) + require.True(t, it.SeekGE(k, base.SeekGEFlagsNone)) + require.EqualValues(t, string(k), string(it.Key().UserKey)) + } + + // SeekGE before the lower bound still succeeds (only the upper bound is + // checked). + for i := 1; i < 3; i++ { + k := key(i) + require.True(t, it.SeekGE(k, base.SeekGEFlagsNone)) + require.EqualValues(t, string(k), string(it.Key().UserKey)) + } + + // SeekGE beyond the upper bound fails. + for i := 7; i < 10; i++ { + require.False(t, it.SeekGE(key(i), base.SeekGEFlagsNone)) + } + + require.True(t, it.SeekGE(key(6), base.SeekGEFlagsNone)) + require.EqualValues(t, "00006", it.Key().UserKey) + require.EqualValues(t, "v00006", it.Value()) + + // Next into the upper bound fails. + require.False(t, it.Next()) + + // SeekLT within the lower and upper bound succeeds. + for i := 4; i <= 7; i++ { + require.True(t, it.SeekLT(key(i), base.SeekLTFlagsNone)) + require.EqualValues(t, string(key(i-1)), string(it.Key().UserKey)) + } + + // SeekLT beyond the upper bound still succeeds (only the lower bound is + // checked). + for i := 8; i < 9; i++ { + require.True(t, it.SeekLT(key(8), base.SeekLTFlagsNone)) + require.EqualValues(t, string(key(i-1)), string(it.Key().UserKey)) + } + + // SeekLT before the lower bound fails. 
+ for i := 1; i < 4; i++ { + require.False(t, it.SeekLT(key(i), base.SeekLTFlagsNone)) + } + + require.True(t, it.SeekLT(key(4), base.SeekLTFlagsNone)) + require.EqualValues(t, "00003", it.Key().UserKey) + require.EqualValues(t, "v00003", it.Value()) + + // Prev into the lower bound fails. + require.False(t, it.Prev()) +} + +func TestBytesIterated(t *testing.T) { + l := NewSkiplist(newArena(arenaSize), bytes.Compare) + emptySize := l.arena.Size() + for i := 0; i < 200; i++ { + bytesIterated := l.bytesIterated(t) + expected := uint64(l.arena.Size() - emptySize) + if bytesIterated != expected { + t.Fatalf("bytesIterated: got %d, want %d", bytesIterated, expected) + } + l.Add(base.InternalKey{UserKey: []byte{byte(i)}}, nil) + } +} + +// bytesIterated returns the number of bytes iterated in the skiplist. +func (s *Skiplist) bytesIterated(t *testing.T) (bytesIterated uint64) { + x := s.NewFlushIter(&bytesIterated) + var prevIterated uint64 + for key, _ := x.First(); key != nil; key, _ = x.Next() { + if bytesIterated < prevIterated { + t.Fatalf("bytesIterated moved backward: %d < %d", bytesIterated, prevIterated) + } + prevIterated = bytesIterated + } + if x.Close() != nil { + return 0 + } + return bytesIterated +} + +func randomKey(rng *rand.Rand, b []byte) base.InternalKey { + key := rng.Uint32() + key2 := rng.Uint32() + binary.LittleEndian.PutUint32(b, key) + binary.LittleEndian.PutUint32(b[4:], key2) + return base.InternalKey{UserKey: b} +} + +// Standard test. Some fraction is read. Some fraction is write. Writes have +// to go through mutex lock. 
+func BenchmarkReadWrite(b *testing.B) { + for i := 0; i <= 10; i++ { + readFrac := float32(i) / 10.0 + b.Run(fmt.Sprintf("frac_%d", i*10), func(b *testing.B) { + l := NewSkiplist(newArena(uint32((b.N+2)*maxNodeSize)), bytes.Compare) + b.ResetTimer() + var count int + b.RunParallel(func(pb *testing.PB) { + it := l.NewIter(nil, nil) + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + buf := make([]byte, 8) + + for pb.Next() { + if rng.Float32() < readFrac { + key, _ := it.SeekGE(randomKey(rng, buf).UserKey, base.SeekGEFlagsNone) + if key != nil { + _ = key + count++ + } + } else { + _ = l.Add(randomKey(rng, buf), nil) + } + } + }) + }) + } +} + +func BenchmarkOrderedWrite(b *testing.B) { + l := NewSkiplist(newArena(8<<20), bytes.Compare) + var ins Inserter + buf := make([]byte, 8) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + binary.BigEndian.PutUint64(buf, uint64(i)) + if err := ins.Add(l, base.InternalKey{UserKey: buf}, nil); err == ErrArenaFull { + b.StopTimer() + l = NewSkiplist(newArena(uint32((b.N+2)*maxNodeSize)), bytes.Compare) + ins = Inserter{} + b.StartTimer() + } + } +} + +func BenchmarkIterNext(b *testing.B) { + l := NewSkiplist(newArena(64<<10), bytes.Compare) + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + buf := make([]byte, 8) + for { + if err := l.Add(randomKey(rng, buf), nil); err == ErrArenaFull { + break + } + } + + it := l.NewIter(nil, nil) + b.ResetTimer() + for i := 0; i < b.N; i++ { + key, _ := it.Next() + if key == nil { + key, _ = it.First() + } + _ = key + } +} + +func BenchmarkIterPrev(b *testing.B) { + l := NewSkiplist(newArena(64<<10), bytes.Compare) + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + buf := make([]byte, 8) + for { + if err := l.Add(randomKey(rng, buf), nil); err == ErrArenaFull { + break + } + } + + it := l.NewIter(nil, nil) + _, _ = it.Last() + b.ResetTimer() + for i := 0; i < b.N; i++ { + key, _ := it.Prev() + if key == nil { + key, _ = it.Last() + } + _ = key + } 
+} + +// BenchmarkSeekPrefixGE looks at the performance of repeated calls to +// SeekPrefixGE, with different skip distances and different settings of +// trySeekUsingNext. +func BenchmarkSeekPrefixGE(b *testing.B) { + l := NewSkiplist(newArena(64<<10), bytes.Compare) + var count int + // count was measured to be 1279. + for count = 0; ; count++ { + if err := l.Add(makeIntKey(count), makeValue(count)); err == ErrArenaFull { + break + } + } + for _, skip := range []int{1, 2, 4, 8, 16} { + for _, useNext := range []bool{false, true} { + b.Run(fmt.Sprintf("skip=%d/use-next=%t", skip, useNext), func(b *testing.B) { + it := l.NewIter(nil, nil) + j := 0 + var k []byte + makeKey := func() { + k = []byte(fmt.Sprintf("%05d", j)) + } + makeKey() + it.SeekPrefixGE(k, k, base.SeekGEFlagsNone) + b.ResetTimer() + for i := 0; i < b.N; i++ { + j += skip + var flags base.SeekGEFlags + if useNext { + flags = flags.EnableTrySeekUsingNext() + } + if j >= count { + j = 0 + flags = flags.DisableTrySeekUsingNext() + } + makeKey() + it.SeekPrefixGE(k, k, flags) + } + }) + } + } +} + +// Standard test. Some fraction is read. Some fraction is write. Writes have +// to go through mutex lock. 
+// func BenchmarkReadWriteMap(b *testing.B) { +// for i := 0; i <= 10; i++ { +// readFrac := float32(i) / 10.0 +// b.Run(fmt.Sprintf("frac_%d", i*10), func(b *testing.B) { +// m := make(map[string]struct{}) +// var mutex sync.RWMutex +// b.ResetTimer() +// var count int +// b.RunParallel(func(pb *testing.PB) { +// rng := rand.New(rand.NewSource(time.Now().UnixNano())) +// for pb.Next() { +// if rng.Float32() < readFrac { +// mutex.RLock() +// _, ok := m[string(randomKey(rng))] +// mutex.RUnlock() +// if ok { +// count++ +// } +// } else { +// mutex.Lock() +// m[string(randomKey(rng))] = struct{}{} +// mutex.Unlock() +// } +// } +// }) +// }) +// } +// } diff --git a/pebble/internal/base/cleaner.go b/pebble/internal/base/cleaner.go new file mode 100644 index 0000000..b86d455 --- /dev/null +++ b/pebble/internal/base/cleaner.go @@ -0,0 +1,60 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import "github.com/cockroachdb/pebble/vfs" + +// Cleaner cleans obsolete files. +type Cleaner interface { + Clean(fs vfs.FS, fileType FileType, path string) error +} + +// NeedsFileContents is implemented by a cleaner that needs the contents of the +// files that it is being asked to clean. +type NeedsFileContents interface { + needsFileContents() +} + +// DeleteCleaner deletes file. +type DeleteCleaner struct{} + +// Clean removes file. +func (DeleteCleaner) Clean(fs vfs.FS, fileType FileType, path string) error { + return fs.Remove(path) +} + +func (DeleteCleaner) String() string { + return "delete" +} + +// ArchiveCleaner archives file instead delete. +type ArchiveCleaner struct{} + +var _ NeedsFileContents = ArchiveCleaner{} + +// Clean archives file. 
+func (ArchiveCleaner) Clean(fs vfs.FS, fileType FileType, path string) error { + switch fileType { + case FileTypeLog, FileTypeManifest, FileTypeTable: + destDir := fs.PathJoin(fs.PathDir(path), "archive") + + if err := fs.MkdirAll(destDir, 0755); err != nil { + return err + } + + destPath := fs.PathJoin(destDir, fs.PathBase(path)) + return fs.Rename(path, destPath) + + default: + return fs.Remove(path) + } +} + +func (ArchiveCleaner) String() string { + return "archive" +} + +func (ArchiveCleaner) needsFileContents() { +} diff --git a/pebble/internal/base/comparer.go b/pebble/internal/base/comparer.go new file mode 100644 index 0000000..a630962 --- /dev/null +++ b/pebble/internal/base/comparer.go @@ -0,0 +1,260 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "bytes" + "encoding/binary" + "fmt" + "strconv" + "unicode/utf8" +) + +// Compare returns -1, 0, or +1 depending on whether a is 'less than', 'equal +// to' or 'greater than' b. The two arguments can only be 'equal' if their +// contents are exactly equal. Furthermore, the empty slice must be 'less than' +// any non-empty slice. Compare is used to compare user keys, such as those +// passed as arguments to the various DB methods, as well as those returned +// from Separator, Successor, and Split. +type Compare func(a, b []byte) int + +// Equal returns true if a and b are equivalent. For a given Compare, +// Equal(a,b) must return true iff Compare(a,b) returns zero, that is, +// Equal is a (potentially faster) specialization of Compare. +type Equal func(a, b []byte) bool + +// AbbreviatedKey returns a fixed length prefix of a user key such that AbbreviatedKey(a) +// < AbbreviatedKey(b) iff a < b and AbbreviatedKey(a) > AbbreviatedKey(b) iff a > b. 
If +// AbbreviatedKey(a) == AbbreviatedKey(b) an additional comparison is required to +// determine if the two keys are actually equal. +// +// This helps optimize indexed batch comparisons for cache locality. If a Split +// function is specified, AbbreviatedKey usually returns the first eight bytes +// of the user key prefix in the order that gives the correct ordering. +type AbbreviatedKey func(key []byte) uint64 + +// FormatKey returns a formatter for the user key. +type FormatKey func(key []byte) fmt.Formatter + +// FormatValue returns a formatter for the user value. The key is also +// specified for the value formatter in order to support value formatting that +// is dependent on the key. +type FormatValue func(key, value []byte) fmt.Formatter + +// Separator is used to construct SSTable index blocks. A trivial implementation +// is `return a`, but appending fewer bytes leads to smaller SSTables. +// +// Given keys a, b for which Compare(a, b) < 0, Separator returns a key k such +// that: +// +// 1. Compare(a, k) <= 0, and +// 2. Compare(k, b) < 0. +// +// As a special case, b may be nil in which case the second condition is dropped. +// +// For example, if dst, a and b are the []byte equivalents of the strings +// "aqua", "black" and "blue", then the result may be "aquablb". +// Similarly, if the arguments were "aqua", "green" and "", then the result +// may be "aquah". +type Separator func(dst, a, b []byte) []byte + +// Successor returns a shortened key given a key a, such that Compare(k, a) >= +// 0. A simple implementation may return a unchanged. The dst parameter may be +// used to store the returned key, though it is valid to pass nil. The returned +// key must be valid to pass to Compare. +type Successor func(dst, a []byte) []byte + +// ImmediateSuccessor is invoked with a prefix key ([Split(a) == len(a)]) and +// returns the smallest key that is larger than the given prefix a. 
+// ImmediateSuccessor must return a prefix key k such that: +// +// Split(k) == len(k) and Compare(k, a) > 0 +// +// and there exists no representable k2 such that: +// +// Split(k2) == len(k2) and Compare(k2, a) > 0 and Compare(k2, k) < 0 +// +// As an example, an implementation built on the natural byte ordering using +// bytes.Compare could append a `\0` to `a`. +// +// The dst parameter may be used to store the returned key, though it is valid +// to pass nil. The returned key must be valid to pass to Compare. +type ImmediateSuccessor func(dst, a []byte) []byte + +// Split returns the length of the prefix of the user key that corresponds to +// the key portion of an MVCC encoding scheme to enable the use of prefix bloom +// filters. +// +// The method will only ever be called with valid MVCC keys, that is, keys that +// the user could potentially store in the database. Pebble does not know which +// keys are MVCC keys and which are not, and may call Split on both MVCC keys +// and non-MVCC keys. +// +// A trivial MVCC scheme is one in which Split() returns len(a). This +// corresponds to assigning a constant version to each key in the database. For +// performance reasons, it is preferable to use a `nil` split in this case. +// +// The returned prefix must have the following properties: +// +// 1. The prefix must be a byte prefix: +// +// bytes.HasPrefix(a, prefix(a)) +// +// 2. A key consisting of just a prefix must sort before all other keys with +// that prefix: +// +// Compare(prefix(a), a) < 0 if len(suffix(a)) > 0 +// +// 3. Prefixes must be used to order keys before suffixes: +// +// If Compare(a, b) <= 0, then Compare(prefix(a), prefix(b)) <= 0 +// +// 4. Suffixes themselves must be valid keys and comparable, respecting the same +// ordering as within a key. 
+// +// If Compare(prefix(a), prefix(b)) == 0, then Compare(suffix(a), suffix(b)) == Compare(a, b) +type Split func(a []byte) int + +// Comparer defines a total ordering over the space of []byte keys: a 'less +// than' relationship. +type Comparer struct { + Compare Compare + Equal Equal + AbbreviatedKey AbbreviatedKey + FormatKey FormatKey + FormatValue FormatValue + Separator Separator + Split Split + Successor Successor + ImmediateSuccessor ImmediateSuccessor + + // Name is the name of the comparer. + // + // The Level-DB on-disk format stores the comparer name, and opening a + // database with a different comparer from the one it was created with + // will result in an error. + Name string +} + +// DefaultFormatter is the default implementation of user key formatting: +// non-ASCII data is formatted as escaped hexadecimal values. +var DefaultFormatter = func(key []byte) fmt.Formatter { + return FormatBytes(key) +} + +// DefaultComparer is the default implementation of the Comparer interface. +// It uses the natural ordering, consistent with bytes.Compare. +var DefaultComparer = &Comparer{ + Compare: bytes.Compare, + Equal: bytes.Equal, + + AbbreviatedKey: func(key []byte) uint64 { + if len(key) >= 8 { + return binary.BigEndian.Uint64(key) + } + var v uint64 + for _, b := range key { + v <<= 8 + v |= uint64(b) + } + return v << uint(8*(8-len(key))) + }, + + FormatKey: DefaultFormatter, + + Separator: func(dst, a, b []byte) []byte { + i, n := SharedPrefixLen(a, b), len(dst) + dst = append(dst, a...) + + min := len(a) + if min > len(b) { + min = len(b) + } + if i >= min { + // Do not shorten if one string is a prefix of the other. + return dst + } + + if a[i] >= b[i] { + // b is smaller than a or a is already the shortest possible. 
+ return dst + } + + if i < len(b)-1 || a[i]+1 < b[i] { + i += n + dst[i]++ + return dst[:i+1] + } + + i += n + 1 + for ; i < len(dst); i++ { + if dst[i] != 0xff { + dst[i]++ + return dst[:i+1] + } + } + return dst + }, + + Successor: func(dst, a []byte) (ret []byte) { + for i := 0; i < len(a); i++ { + if a[i] != 0xff { + dst = append(dst, a[:i+1]...) + dst[len(dst)-1]++ + return dst + } + } + // a is a run of 0xffs, leave it alone. + return append(dst, a...) + }, + + ImmediateSuccessor: func(dst, a []byte) (ret []byte) { + return append(append(dst, a...), 0x00) + }, + + // This name is part of the C++ Level-DB implementation's default file + // format, and should not be changed. + Name: "leveldb.BytewiseComparator", +} + +// SharedPrefixLen returns the largest i such that a[:i] equals b[:i]. +// This function can be useful in implementing the Comparer interface. +func SharedPrefixLen(a, b []byte) int { + i, n := 0, len(a) + if n > len(b) { + n = len(b) + } + asUint64 := func(c []byte, i int) uint64 { + return binary.LittleEndian.Uint64(c[i:]) + } + for i < n-7 && asUint64(a, i) == asUint64(b, i) { + i += 8 + } + for i < n && a[i] == b[i] { + i++ + } + return i +} + +// FormatBytes formats a byte slice using hexadecimal escapes for non-ASCII +// data. +type FormatBytes []byte + +const lowerhex = "0123456789abcdef" + +// Format implements the fmt.Formatter interface. +func (p FormatBytes) Format(s fmt.State, c rune) { + buf := make([]byte, 0, len(p)) + for _, b := range p { + if b < utf8.RuneSelf && strconv.IsPrint(rune(b)) { + buf = append(buf, b) + continue + } + buf = append(buf, `\x`...) + buf = append(buf, lowerhex[b>>4]) + buf = append(buf, lowerhex[b&0xF]) + } + s.Write(buf) +} diff --git a/pebble/internal/base/comparer_test.go b/pebble/internal/base/comparer_test.go new file mode 100644 index 0000000..ae49a31 --- /dev/null +++ b/pebble/internal/base/comparer_test.go @@ -0,0 +1,117 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "fmt" + "slices" + "testing" + "time" + + "golang.org/x/exp/rand" +) + +func TestDefAppendSeparator(t *testing.T) { + testCases := []struct { + a, b, want string + }{ + // Examples from the doc comments. + {"black", "blue", "blb"}, + {"green", "", "green"}, + // Non-empty b values. The C++ Level-DB code calls these separators. + {"", "2", ""}, + {"1", "2", "1"}, + {"1", "29", "2"}, + {"13", "19", "14"}, + {"13", "99", "2"}, + {"135", "19", "14"}, + {"1357", "19", "14"}, + {"1357", "2", "14"}, + {"13\xff", "14", "13\xff"}, + {"13\xff", "19", "14"}, + {"1\xff\xff", "19", "1\xff\xff"}, + {"1\xff\xff", "2", "1\xff\xff"}, + {"1\xff\xff", "9", "2"}, + // Empty b values. The C++ Level-DB code calls these successors. + {"", "", ""}, + {"1", "", "1"}, + {"11", "", "11"}, + {"11\xff", "", "11\xff"}, + {"1\xff", "", "1\xff"}, + {"1\xff\xff", "", "1\xff\xff"}, + {"\xff", "", "\xff"}, + {"\xff\xff", "", "\xff\xff"}, + {"\xff\xff\xff", "", "\xff\xff\xff"}, + } + for _, tc := range testCases { + t.Run("", func(t *testing.T) { + got := string(DefaultComparer.Separator(nil, []byte(tc.a), []byte(tc.b))) + if got != tc.want { + t.Errorf("a, b = %q, %q: got %q, want %q", tc.a, tc.b, got, tc.want) + } + }) + } +} + +func TestAbbreviatedKey(t *testing.T) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + randBytes := func(size int) []byte { + data := make([]byte, size) + for i := range data { + data[i] = byte(rng.Int() & 0xff) + } + return data + } + + keys := make([][]byte, 10000) + for i := range keys { + keys[i] = randBytes(rng.Intn(16)) + } + slices.SortFunc(keys, DefaultComparer.Compare) + + for i := 1; i < len(keys); i++ { + last := DefaultComparer.AbbreviatedKey(keys[i-1]) + cur := DefaultComparer.AbbreviatedKey(keys[i]) + cmp := DefaultComparer.Compare(keys[i-1], keys[i]) + if cmp == 0 { + if last != cur { + t.Fatalf("expected 
equal abbreviated keys: %x[%x] != %x[%x]", + last, keys[i-1], cur, keys[i]) + } + } else { + if last > cur { + t.Fatalf("unexpected abbreviated key ordering: %x[%x] > %x[%x]", + last, keys[i-1], cur, keys[i]) + } + } + } +} + +func BenchmarkAbbreviatedKey(b *testing.B) { + rng := rand.New(rand.NewSource(1449168817)) + randBytes := func(size int) []byte { + data := make([]byte, size) + for i := range data { + data[i] = byte(rng.Int() & 0xff) + } + return data + } + keys := make([][]byte, 10000) + for i := range keys { + keys[i] = randBytes(8) + } + + b.ResetTimer() + var sum uint64 + for i := 0; i < b.N; i++ { + j := i % len(keys) + sum += DefaultComparer.AbbreviatedKey(keys[j]) + } + + if testing.Verbose() { + // Ensure the compiler doesn't optimize away our benchmark. + fmt.Println(sum) + } +} diff --git a/pebble/internal/base/error.go b/pebble/internal/base/error.go new file mode 100644 index 0000000..6ef7783 --- /dev/null +++ b/pebble/internal/base/error.go @@ -0,0 +1,28 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import "github.com/cockroachdb/errors" + +// ErrNotFound means that a get or delete call did not find the requested key. +var ErrNotFound = errors.New("pebble: not found") + +// ErrCorruption is a marker to indicate that data in a file (WAL, MANIFEST, +// sstable) isn't in the expected format. +var ErrCorruption = errors.New("pebble: corruption") + +// MarkCorruptionError marks given error as a corruption error. +func MarkCorruptionError(err error) error { + if errors.Is(err, ErrCorruption) { + return err + } + return errors.Mark(err, ErrCorruption) +} + +// CorruptionErrorf formats according to a format specifier and returns +// the string as an error value that is marked as a corruption error. 
+func CorruptionErrorf(format string, args ...interface{}) error { + return errors.Mark(errors.Newf(format, args...), ErrCorruption) +} diff --git a/pebble/internal/base/filenames.go b/pebble/internal/base/filenames.go new file mode 100644 index 0000000..06098ab --- /dev/null +++ b/pebble/internal/base/filenames.go @@ -0,0 +1,202 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "fmt" + "strconv" + "strings" + + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/redact" +) + +// FileNum is an internal DB identifier for a file. +type FileNum uint64 + +// String returns a string representation of the file number. +func (fn FileNum) String() string { return fmt.Sprintf("%06d", fn) } + +// SafeFormat implements redact.SafeFormatter. +func (fn FileNum) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("%06d", redact.SafeUint(fn)) +} + +// DiskFileNum converts a FileNum to a DiskFileNum. DiskFileNum should only be +// called if the caller can ensure that the FileNum belongs to a physical file +// on disk. These could be manifests, log files, physical sstables on disk, the +// options file, but not virtual sstables. +func (fn FileNum) DiskFileNum() DiskFileNum { + return DiskFileNum(fn) +} + +// A DiskFileNum is just a FileNum belonging to a file which exists on disk. +// Note that a FileNum is an internal DB identifier and it could belong to files +// which don't exist on disk. An example would be virtual sstable FileNums. +// Converting a DiskFileNum to a FileNum is always valid, whereas converting a +// FileNum to DiskFileNum may not be valid and care should be taken to prove +// that the FileNum actually exists on disk. 
+type DiskFileNum uint64 + +func (dfn DiskFileNum) String() string { return fmt.Sprintf("%06d", dfn) } + +// SafeFormat implements redact.SafeFormatter. +func (dfn DiskFileNum) SafeFormat(w redact.SafePrinter, verb rune) { + w.Printf("%06d", redact.SafeUint(dfn)) +} + +// FileNum converts a DiskFileNum to a FileNum. This conversion is always valid. +func (dfn DiskFileNum) FileNum() FileNum { + return FileNum(dfn) +} + +// FileType enumerates the types of files found in a DB. +type FileType int + +// The FileType enumeration. +const ( + FileTypeLog FileType = iota + FileTypeLock + FileTypeTable + FileTypeManifest + FileTypeCurrent + FileTypeOptions + FileTypeOldTemp + FileTypeTemp +) + +// MakeFilename builds a filename from components. +func MakeFilename(fileType FileType, dfn DiskFileNum) string { + switch fileType { + case FileTypeLog: + return fmt.Sprintf("%s.log", dfn) + case FileTypeLock: + return "LOCK" + case FileTypeTable: + return fmt.Sprintf("%s.sst", dfn) + case FileTypeManifest: + return fmt.Sprintf("MANIFEST-%s", dfn) + case FileTypeCurrent: + return "CURRENT" + case FileTypeOptions: + return fmt.Sprintf("OPTIONS-%s", dfn) + case FileTypeOldTemp: + return fmt.Sprintf("CURRENT.%s.dbtmp", dfn) + case FileTypeTemp: + return fmt.Sprintf("temporary.%s.dbtmp", dfn) + } + panic("unreachable") +} + +// MakeFilepath builds a filepath from components. +func MakeFilepath(fs vfs.FS, dirname string, fileType FileType, dfn DiskFileNum) string { + return fs.PathJoin(dirname, MakeFilename(fileType, dfn)) +} + +// ParseFilename parses the components from a filename. 
+func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileNum, ok bool) { + filename = fs.PathBase(filename) + switch { + case filename == "CURRENT": + return FileTypeCurrent, 0, true + case filename == "LOCK": + return FileTypeLock, 0, true + case strings.HasPrefix(filename, "MANIFEST-"): + dfn, ok = parseDiskFileNum(filename[len("MANIFEST-"):]) + if !ok { + break + } + return FileTypeManifest, dfn, true + case strings.HasPrefix(filename, "OPTIONS-"): + dfn, ok = parseDiskFileNum(filename[len("OPTIONS-"):]) + if !ok { + break + } + return FileTypeOptions, dfn, ok + case strings.HasPrefix(filename, "CURRENT.") && strings.HasSuffix(filename, ".dbtmp"): + s := strings.TrimSuffix(filename[len("CURRENT."):], ".dbtmp") + dfn, ok = parseDiskFileNum(s) + if !ok { + break + } + return FileTypeOldTemp, dfn, ok + case strings.HasPrefix(filename, "temporary.") && strings.HasSuffix(filename, ".dbtmp"): + s := strings.TrimSuffix(filename[len("temporary."):], ".dbtmp") + dfn, ok = parseDiskFileNum(s) + if !ok { + break + } + return FileTypeTemp, dfn, ok + default: + i := strings.IndexByte(filename, '.') + if i < 0 { + break + } + dfn, ok = parseDiskFileNum(filename[:i]) + if !ok { + break + } + switch filename[i+1:] { + case "sst": + return FileTypeTable, dfn, true + case "log": + return FileTypeLog, dfn, true + } + } + return 0, dfn, false +} + +func parseDiskFileNum(s string) (dfn DiskFileNum, ok bool) { + u, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return dfn, false + } + return DiskFileNum(u), true +} + +// A Fataler fatals a process with a message when called. +type Fataler interface { + Fatalf(format string, args ...interface{}) +} + +// MustExist checks if err is an error indicating a file does not exist. +// If it is, it lists the containing directory's files to annotate the error +// with counts of the various types of files and invokes the provided fataler. +// See cockroachdb/cockroach#56490. 
+func MustExist(fs vfs.FS, filename string, fataler Fataler, err error) { + if err == nil || !oserror.IsNotExist(err) { + return + } + + ls, lsErr := fs.List(fs.PathDir(filename)) + if lsErr != nil { + // TODO(jackson): if oserror.IsNotExist(lsErr), then the data directory + // doesn't exist anymore. Another process likely deleted it before + // killing the process. We want to fatal the process, but without + // triggering error reporting like Sentry. + fataler.Fatalf("%s:\norig err: %s\nlist err: %s", redact.Safe(fs.PathBase(filename)), err, lsErr) + } + var total, unknown, tables, logs, manifests int + total = len(ls) + for _, f := range ls { + typ, _, ok := ParseFilename(fs, f) + if !ok { + unknown++ + continue + } + switch typ { + case FileTypeTable: + tables++ + case FileTypeLog: + logs++ + case FileTypeManifest: + manifests++ + } + } + + fataler.Fatalf("%s:\n%s\ndirectory contains %d files, %d unknown, %d tables, %d logs, %d manifests", + fs.PathBase(filename), err, total, unknown, tables, logs, manifests) +} diff --git a/pebble/internal/base/filenames_test.go b/pebble/internal/base/filenames_test.go new file mode 100644 index 0000000..07b7430 --- /dev/null +++ b/pebble/internal/base/filenames_test.go @@ -0,0 +1,114 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file.
+ +package base + +import ( + "bytes" + "fmt" + "os" + "testing" + + "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/redact" + "github.com/stretchr/testify/require" +) + +func TestParseFilename(t *testing.T) { + testCases := map[string]bool{ + "000000.log": true, + "000000.log.zip": false, + "000000..log": false, + "a000000.log": false, + "abcdef.log": false, + "000001ldb": false, + "000001.sst": true, + "CURRENT": true, + "CURRaNT": false, + "LOCK": true, + "xLOCK": false, + "x.LOCK": false, + "MANIFEST": false, + "MANIFEST123456": false, + "MANIFEST-": false, + "MANIFEST-123456": true, + "MANIFEST-123456.doc": false, + "OPTIONS": false, + "OPTIONS123456": false, + "OPTIONS-": false, + "OPTIONS-123456": true, + "OPTIONS-123456.doc": false, + "CURRENT.123456": false, + "CURRENT.dbtmp": false, + "CURRENT.123456.dbtmp": true, + "temporary.123456.dbtmp": true, + } + fs := vfs.NewMem() + for tc, want := range testCases { + _, _, got := ParseFilename(fs, fs.PathJoin("foo", tc)) + if got != want { + t.Errorf("%q: got %v, want %v", tc, got, want) + } + } +} + +func TestFilenameRoundTrip(t *testing.T) { + testCases := map[FileType]bool{ + // CURRENT and LOCK files aren't numbered. + FileTypeCurrent: false, + FileTypeLock: false, + // The remaining file types are numbered. 
+ FileTypeLog: true, + FileTypeManifest: true, + FileTypeTable: true, + FileTypeOptions: true, + FileTypeOldTemp: true, + FileTypeTemp: true, + } + fs := vfs.NewMem() + for fileType, numbered := range testCases { + fileNums := []FileNum{0} + if numbered { + fileNums = []FileNum{0, 1, 2, 3, 10, 42, 99, 1001} + } + for _, fileNum := range fileNums { + filename := MakeFilepath(fs, "foo", fileType, fileNum.DiskFileNum()) + gotFT, gotFN, gotOK := ParseFilename(fs, filename) + if !gotOK { + t.Errorf("could not parse %q", filename) + continue + } + if gotFT != fileType || gotFN.FileNum() != fileNum { + t.Errorf("filename=%q: got %v, %v, want %v, %v", filename, gotFT, gotFN, fileType, fileNum) + continue + } + } + } +} + +type bufferFataler struct { + buf bytes.Buffer +} + +func (b *bufferFataler) Fatalf(msg string, args ...interface{}) { + fmt.Fprintf(&b.buf, msg, args...) +} + +func TestMustExist(t *testing.T) { + err := os.ErrNotExist + fs := vfs.Default + var buf bufferFataler + filename := fs.PathJoin("..", "..", "testdata", "db-stage-4", "000000.sst") + + MustExist(fs, filename, &buf, err) + require.Equal(t, `000000.sst: +file does not exist +directory contains 9 files, 2 unknown, 1 tables, 1 logs, 2 manifests`, buf.buf.String()) +} + +func TestRedactFileNum(t *testing.T) { + // Ensure that redaction never redacts file numbers. + require.Equal(t, redact.RedactableString("000005"), redact.Sprint(FileNum(5))) + require.Equal(t, redact.RedactableString("000005"), redact.Sprint(DiskFileNum(5))) +} diff --git a/pebble/internal/base/internal.go b/pebble/internal/base/internal.go new file mode 100644 index 0000000..db691ee --- /dev/null +++ b/pebble/internal/base/internal.go @@ -0,0 +1,502 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package base // import "github.com/cockroachdb/pebble/internal/base" + +import ( + "encoding/binary" + "fmt" + "strconv" + "strings" + + "github.com/cockroachdb/redact" + "github.com/cockroachdb/pebble/shims/cmp" +) + +const ( + // SeqNumZero is the zero sequence number, set by compactions if they can + // guarantee there are no keys underneath an internal key. + SeqNumZero = uint64(0) + // SeqNumStart is the first sequence number assigned to a key. Sequence + // numbers 1-9 are reserved for potential future use. + SeqNumStart = uint64(10) +) + +// InternalKeyKind enumerates the kind of key: a deletion tombstone, a set +// value, a merged value, etc. +type InternalKeyKind uint8 + +// These constants are part of the file format, and should not be changed. +const ( + InternalKeyKindDelete InternalKeyKind = 0 + InternalKeyKindSet InternalKeyKind = 1 + InternalKeyKindMerge InternalKeyKind = 2 + InternalKeyKindLogData InternalKeyKind = 3 + //InternalKeyKindColumnFamilyDeletion InternalKeyKind = 4 + //InternalKeyKindColumnFamilyValue InternalKeyKind = 5 + //InternalKeyKindColumnFamilyMerge InternalKeyKind = 6 + + // InternalKeyKindSingleDelete (SINGLEDEL) is a performance optimization + // solely for compactions (to reduce write amp and space amp). Readers other + // than compactions should treat SINGLEDEL as equivalent to a DEL. + // Historically, it was simpler for readers other than compactions to treat + // SINGLEDEL as equivalent to DEL, but as of the introduction of + // InternalKeyKindSSTableInternalObsoleteBit, this is also necessary for + // correctness. 
+ InternalKeyKindSingleDelete InternalKeyKind = 7 + //InternalKeyKindColumnFamilySingleDelete InternalKeyKind = 8 + //InternalKeyKindBeginPrepareXID InternalKeyKind = 9 + //InternalKeyKindEndPrepareXID InternalKeyKind = 10 + //InternalKeyKindCommitXID InternalKeyKind = 11 + //InternalKeyKindRollbackXID InternalKeyKind = 12 + //InternalKeyKindNoop InternalKeyKind = 13 + //InternalKeyKindColumnFamilyRangeDelete InternalKeyKind = 14 + InternalKeyKindRangeDelete InternalKeyKind = 15 + //InternalKeyKindColumnFamilyBlobIndex InternalKeyKind = 16 + //InternalKeyKindBlobIndex InternalKeyKind = 17 + + // InternalKeyKindSeparator is a key used for separator / successor keys + // written to sstable block indexes. + // + // NOTE: the RocksDB value has been repurposed. This was done to ensure that + // keys written to block indexes with value "17" (when 17 happened to be the + // max value, and InternalKeyKindMax was therefore set to 17), remain stable + // when new key kinds are supported in Pebble. + InternalKeyKindSeparator InternalKeyKind = 17 + + // InternalKeyKindSetWithDelete keys are SET keys that have met with a + // DELETE or SINGLEDEL key in a prior compaction. This key kind is + // specific to Pebble. See + // https://github.com/cockroachdb/pebble/issues/1255. + InternalKeyKindSetWithDelete InternalKeyKind = 18 + + // InternalKeyKindRangeKeyDelete removes all range keys within a key range. + // See the internal/rangekey package for more details. + InternalKeyKindRangeKeyDelete InternalKeyKind = 19 + // InternalKeyKindRangeKeySet and InternalKeyKindRangeUnset represent + // keys that set and unset values associated with ranges of key + // space. See the internal/rangekey package for more details. + InternalKeyKindRangeKeyUnset InternalKeyKind = 20 + InternalKeyKindRangeKeySet InternalKeyKind = 21 + + // InternalKeyKindIngestSST is used to distinguish a batch that corresponds to + // the WAL entry for ingested sstables that are added to the flushable + // queue. 
This InternalKeyKind cannot appear, amongst other key kinds in a + // batch, or in an sstable. + InternalKeyKindIngestSST InternalKeyKind = 22 + + // InternalKeyKindDeleteSized keys behave identically to + // InternalKeyKindDelete keys, except that they hold an associated uint64 + // value indicating the (len(key)+len(value)) of the shadowed entry the + // tombstone is expected to delete. This value is used to inform compaction + // heuristics, but is not required to be accurate for correctness. + InternalKeyKindDeleteSized InternalKeyKind = 23 + + // This maximum value isn't part of the file format. Future extensions may + // increase this value. + // + // When constructing an internal key to pass to DB.Seek{GE,LE}, + // internalKeyComparer sorts decreasing by kind (after sorting increasing by + // user key and decreasing by sequence number). Thus, use InternalKeyKindMax, + // which sorts 'less than or equal to' any other valid internalKeyKind, when + // searching for any kind of internal key formed by a certain user key and + // seqNum. + InternalKeyKindMax InternalKeyKind = 23 + + // Internal to the sstable format. Not exposed by any sstable iterator. + // Declared here to prevent definition of valid key kinds that set this bit. + InternalKeyKindSSTableInternalObsoleteBit InternalKeyKind = 64 + InternalKeyKindSSTableInternalObsoleteMask InternalKeyKind = 191 + + // InternalKeyZeroSeqnumMaxTrailer is the largest trailer with a + // zero sequence number. + InternalKeyZeroSeqnumMaxTrailer = uint64(255) + + // A marker for an invalid key. + InternalKeyKindInvalid InternalKeyKind = InternalKeyKindSSTableInternalObsoleteMask + + // InternalKeySeqNumBatch is a bit that is set on batch sequence numbers + // which prevents those entries from being excluded from iteration. + InternalKeySeqNumBatch = uint64(1 << 55) + + // InternalKeySeqNumMax is the largest valid sequence number. 
+ InternalKeySeqNumMax = uint64(1<<56 - 1) + + // InternalKeyRangeDeleteSentinel is the marker for a range delete sentinel + // key. This sequence number and kind are used for the upper stable boundary + // when a range deletion tombstone is the largest key in an sstable. This is + // necessary because sstable boundaries are inclusive, while the end key of a + // range deletion tombstone is exclusive. + InternalKeyRangeDeleteSentinel = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeDelete) + + // InternalKeyBoundaryRangeKey is the marker for a range key boundary. This + // sequence number and kind are used during interleaved range key and point + // iteration to allow an iterator to stop at range key start keys where + // there exists no point key. + InternalKeyBoundaryRangeKey = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeKeySet) +) + +// Assert InternalKeyKindSSTableInternalObsoleteBit > InternalKeyKindMax +const _ = uint(InternalKeyKindSSTableInternalObsoleteBit - InternalKeyKindMax - 1) + +var internalKeyKindNames = []string{ + InternalKeyKindDelete: "DEL", + InternalKeyKindSet: "SET", + InternalKeyKindMerge: "MERGE", + InternalKeyKindLogData: "LOGDATA", + InternalKeyKindSingleDelete: "SINGLEDEL", + InternalKeyKindRangeDelete: "RANGEDEL", + InternalKeyKindSeparator: "SEPARATOR", + InternalKeyKindSetWithDelete: "SETWITHDEL", + InternalKeyKindRangeKeySet: "RANGEKEYSET", + InternalKeyKindRangeKeyUnset: "RANGEKEYUNSET", + InternalKeyKindRangeKeyDelete: "RANGEKEYDEL", + InternalKeyKindIngestSST: "INGESTSST", + InternalKeyKindDeleteSized: "DELSIZED", + InternalKeyKindInvalid: "INVALID", +} + +func (k InternalKeyKind) String() string { + if int(k) < len(internalKeyKindNames) { + return internalKeyKindNames[k] + } + return fmt.Sprintf("UNKNOWN:%d", k) +} + +// SafeFormat implements redact.SafeFormatter. 
+func (k InternalKeyKind) SafeFormat(w redact.SafePrinter, _ rune) { + w.Print(redact.SafeString(k.String())) +} + +// InternalKey is a key used for the in-memory and on-disk partial DBs that +// make up a pebble DB. +// +// It consists of the user key (as given by the code that uses package pebble) +// followed by 8-bytes of metadata: +// - 1 byte for the type of internal key: delete or set, +// - 7 bytes for a uint56 sequence number, in little-endian format. +type InternalKey struct { + UserKey []byte + Trailer uint64 +} + +// InvalidInternalKey is an invalid internal key for which Valid() will return +// false. +var InvalidInternalKey = MakeInternalKey(nil, 0, InternalKeyKindInvalid) + +// MakeInternalKey constructs an internal key from a specified user key, +// sequence number and kind. +func MakeInternalKey(userKey []byte, seqNum uint64, kind InternalKeyKind) InternalKey { + return InternalKey{ + UserKey: userKey, + Trailer: (seqNum << 8) | uint64(kind), + } +} + +// MakeTrailer constructs an internal key trailer from the specified sequence +// number and kind. +func MakeTrailer(seqNum uint64, kind InternalKeyKind) uint64 { + return (seqNum << 8) | uint64(kind) +} + +// MakeSearchKey constructs an internal key that is appropriate for searching +// for the specified user key. The search key contains the maximal sequence +// number and kind ensuring that it sorts before any other internal keys for +// the same user key. +func MakeSearchKey(userKey []byte) InternalKey { + return InternalKey{ + UserKey: userKey, + Trailer: (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindMax), + } +} + +// MakeRangeDeleteSentinelKey constructs an internal key that is a range +// deletion sentinel key, used as the upper boundary for an sstable when a +// range deletion is the largest key in an sstable.
+func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey { + return InternalKey{ + UserKey: userKey, + Trailer: InternalKeyRangeDeleteSentinel, + } +} + +// MakeExclusiveSentinelKey constructs an internal key that is an +// exclusive sentinel key, used as the upper boundary for an sstable +// when a ranged key is the largest key in an sstable. +func MakeExclusiveSentinelKey(kind InternalKeyKind, userKey []byte) InternalKey { + return InternalKey{ + UserKey: userKey, + Trailer: (InternalKeySeqNumMax << 8) | uint64(kind), + } +} + +var kindsMap = map[string]InternalKeyKind{ + "DEL": InternalKeyKindDelete, + "SINGLEDEL": InternalKeyKindSingleDelete, + "RANGEDEL": InternalKeyKindRangeDelete, + "LOGDATA": InternalKeyKindLogData, + "SET": InternalKeyKindSet, + "MERGE": InternalKeyKindMerge, + "INVALID": InternalKeyKindInvalid, + "SEPARATOR": InternalKeyKindSeparator, + "SETWITHDEL": InternalKeyKindSetWithDelete, + "RANGEKEYSET": InternalKeyKindRangeKeySet, + "RANGEKEYUNSET": InternalKeyKindRangeKeyUnset, + "RANGEKEYDEL": InternalKeyKindRangeKeyDelete, + "INGESTSST": InternalKeyKindIngestSST, + "DELSIZED": InternalKeyKindDeleteSized, +} + +// ParseInternalKey parses the string representation of an internal key. The +// format is ... If the seq-num starts with a "b" it +// is marked as a batch-seq-num (i.e. the InternalKeySeqNumBatch bit is set). +func ParseInternalKey(s string) InternalKey { + x := strings.Split(s, ".") + ukey := x[0] + kind, ok := kindsMap[x[1]] + if !ok { + panic(fmt.Sprintf("unknown kind: %q", x[1])) + } + j := 0 + if x[2][0] == 'b' { + j = 1 + } + seqNum, _ := strconv.ParseUint(x[2][j:], 10, 64) + if x[2][0] == 'b' { + seqNum |= InternalKeySeqNumBatch + } + return MakeInternalKey([]byte(ukey), seqNum, kind) +} + +// ParseKind parses the string representation of an internal key kind. 
+func ParseKind(s string) InternalKeyKind { + kind, ok := kindsMap[s] + if !ok { + panic(fmt.Sprintf("unknown kind: %q", s)) + } + return kind +} + +// InternalTrailerLen is the number of bytes used to encode InternalKey.Trailer. +const InternalTrailerLen = 8 + +// DecodeInternalKey decodes an encoded internal key. See InternalKey.Encode(). +func DecodeInternalKey(encodedKey []byte) InternalKey { + n := len(encodedKey) - InternalTrailerLen + var trailer uint64 + if n >= 0 { + trailer = binary.LittleEndian.Uint64(encodedKey[n:]) + encodedKey = encodedKey[:n:n] + } else { + trailer = uint64(InternalKeyKindInvalid) + encodedKey = nil + } + return InternalKey{ + UserKey: encodedKey, + Trailer: trailer, + } +} + +// InternalCompare compares two internal keys using the specified comparison +// function. For equal user keys, internal keys compare in descending sequence +// number order. For equal user keys and sequence numbers, internal keys +// compare in descending kind order (this may happen in practice among range +// keys). +func InternalCompare(userCmp Compare, a, b InternalKey) int { + if x := userCmp(a.UserKey, b.UserKey); x != 0 { + return x + } + // Reverse order for trailer comparison. + return cmp.Compare(b.Trailer, a.Trailer) +} + +// Encode encodes the receiver into the buffer. The buffer must be large enough +// to hold the encoded data. See InternalKey.Size(). +func (k InternalKey) Encode(buf []byte) { + i := copy(buf, k.UserKey) + binary.LittleEndian.PutUint64(buf[i:], k.Trailer) +} + +// EncodeTrailer returns the trailer encoded to an 8-byte array. +func (k InternalKey) EncodeTrailer() [8]byte { + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], k.Trailer) + return buf +} + +// Separator returns a separator key such that k <= x && x < other, where less +// than is consistent with the Compare function. The buf parameter may be used +// to store the returned InternalKey.UserKey, though it is valid to pass a +// nil. 
See the Separator type for details on separator keys. +func (k InternalKey) Separator( + cmp Compare, sep Separator, buf []byte, other InternalKey, +) InternalKey { + buf = sep(buf, k.UserKey, other.UserKey) + if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 { + // The separator user key is physically shorter than k.UserKey (if it is + // longer, we'll continue to use "k"), but logically after. Tack on the max + // sequence number to the shortened user key. Note that we could tack on + // any sequence number and kind here to create a valid separator key. We + // use the max sequence number to match the behavior of LevelDB and + // RocksDB. + return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator) + } + return k +} + +// Successor returns a successor key such that k <= x. A simple implementation +// may return k unchanged. The buf parameter may be used to store the returned +// InternalKey.UserKey, though it is valid to pass a nil. +func (k InternalKey) Successor(cmp Compare, succ Successor, buf []byte) InternalKey { + buf = succ(buf, k.UserKey) + if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 { + // The successor user key is physically shorter that k.UserKey (if it is + // longer, we'll continue to use "k"), but logically after. Tack on the max + // sequence number to the shortened user key. Note that we could tack on + // any sequence number and kind here to create a valid separator key. We + // use the max sequence number to match the behavior of LevelDB and + // RocksDB. + return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator) + } + return k +} + +// Size returns the encoded size of the key. +func (k InternalKey) Size() int { + return len(k.UserKey) + 8 +} + +// SetSeqNum sets the sequence number component of the key. +func (k *InternalKey) SetSeqNum(seqNum uint64) { + k.Trailer = (seqNum << 8) | (k.Trailer & 0xff) +} + +// SeqNum returns the sequence number component of the key. 
+func (k InternalKey) SeqNum() uint64 { + return k.Trailer >> 8 +} + +// SeqNumFromTrailer returns the sequence number component of a trailer. +func SeqNumFromTrailer(t uint64) uint64 { + return t >> 8 +} + +// Visible returns true if the key is visible at the specified snapshot +// sequence number. +func (k InternalKey) Visible(snapshot, batchSnapshot uint64) bool { + return Visible(k.SeqNum(), snapshot, batchSnapshot) +} + +// Visible returns true if a key with the provided sequence number is visible at +// the specified snapshot sequence numbers. +func Visible(seqNum uint64, snapshot, batchSnapshot uint64) bool { + // There are two snapshot sequence numbers, one for committed keys and one + // for batch keys. If a seqNum is less than `snapshot`, then seqNum + // corresponds to a committed key that is visible. If seqNum has its batch + // bit set, then seqNum corresponds to an uncommitted batch key. Its + // visible if its snapshot is less than batchSnapshot. + // + // There's one complication. The maximal sequence number + // (`InternalKeySeqNumMax`) is used across Pebble for exclusive sentinel + // keys and other purposes. The maximal sequence number has its batch bit + // set, but it can never be < `batchSnapshot`, since there is no expressible + // larger snapshot. We dictate that the maximal sequence number is always + // visible. + return seqNum < snapshot || + ((seqNum&InternalKeySeqNumBatch) != 0 && seqNum < batchSnapshot) || + seqNum == InternalKeySeqNumMax +} + +// SetKind sets the kind component of the key. +func (k *InternalKey) SetKind(kind InternalKeyKind) { + k.Trailer = (k.Trailer &^ 0xff) | uint64(kind) +} + +// Kind returns the kind component of the key. +func (k InternalKey) Kind() InternalKeyKind { + return TrailerKind(k.Trailer) +} + +// TrailerKind returns the key kind of the key trailer. +func TrailerKind(trailer uint64) InternalKeyKind { + return InternalKeyKind(trailer & 0xff) +} + +// Valid returns true if the key has a valid kind. 
+func (k InternalKey) Valid() bool { + return k.Kind() <= InternalKeyKindMax +} + +// Clone clones the storage for the UserKey component of the key. +func (k InternalKey) Clone() InternalKey { + if len(k.UserKey) == 0 { + return k + } + return InternalKey{ + UserKey: append([]byte(nil), k.UserKey...), + Trailer: k.Trailer, + } +} + +// CopyFrom converts this InternalKey into a clone of the passed-in InternalKey, +// reusing any space already used for the current UserKey. +func (k *InternalKey) CopyFrom(k2 InternalKey) { + k.UserKey = append(k.UserKey[:0], k2.UserKey...) + k.Trailer = k2.Trailer +} + +// String returns a string representation of the key. +func (k InternalKey) String() string { + return fmt.Sprintf("%s#%d,%d", FormatBytes(k.UserKey), k.SeqNum(), k.Kind()) +} + +// Pretty returns a formatter for the key. +func (k InternalKey) Pretty(f FormatKey) fmt.Formatter { + return prettyInternalKey{k, f} +} + +// IsExclusiveSentinel returns whether this internal key excludes point keys +// with the same user key if used as an end boundary. See the comment on +// InternalKeyRangeDeleteSentinel. +func (k InternalKey) IsExclusiveSentinel() bool { + switch kind := k.Kind(); kind { + case InternalKeyKindRangeDelete: + return k.Trailer == InternalKeyRangeDeleteSentinel + case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet: + return (k.Trailer >> 8) == InternalKeySeqNumMax + default: + return false + } +} + +type prettyInternalKey struct { + InternalKey + formatKey FormatKey +} + +func (k prettyInternalKey) Format(s fmt.State, c rune) { + if seqNum := k.SeqNum(); seqNum == InternalKeySeqNumMax { + fmt.Fprintf(s, "%s#inf,%s", k.formatKey(k.UserKey), k.Kind()) + } else { + fmt.Fprintf(s, "%s#%d,%s", k.formatKey(k.UserKey), k.SeqNum(), k.Kind()) + } +} + +// ParsePrettyInternalKey parses the pretty string representation of an +// internal key. The format is <user-key>#<seq-num>,<kind>.
+func ParsePrettyInternalKey(s string) InternalKey { + x := strings.FieldsFunc(s, func(c rune) bool { return c == '#' || c == ',' }) + ukey := x[0] + kind, ok := kindsMap[x[2]] + if !ok { + panic(fmt.Sprintf("unknown kind: %q", x[2])) + } + var seqNum uint64 + if x[1] == "max" || x[1] == "inf" { + seqNum = InternalKeySeqNumMax + } else { + seqNum, _ = strconv.ParseUint(x[1], 10, 64) + } + return MakeInternalKey([]byte(ukey), seqNum, kind) +} diff --git a/pebble/internal/base/internal_test.go b/pebble/internal/base/internal_test.go new file mode 100644 index 0000000..39466cd --- /dev/null +++ b/pebble/internal/base/internal_test.go @@ -0,0 +1,226 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func (k InternalKey) encodedString() string { + buf := make([]byte, k.Size()) + k.Encode(buf) + return string(buf) +} + +func TestInternalKey(t *testing.T) { + k := MakeInternalKey([]byte("foo"), 0x08070605040302, 1) + if got, want := k.encodedString(), "foo\x01\x02\x03\x04\x05\x06\x07\x08"; got != want { + t.Fatalf("k = %q want %q", got, want) + } + if !k.Valid() { + t.Fatalf("invalid key") + } + if got, want := string(k.UserKey), "foo"; got != want { + t.Errorf("ukey = %q want %q", got, want) + } + if got, want := k.Kind(), InternalKeyKind(1); got != want { + t.Errorf("kind = %d want %d", got, want) + } + if got, want := k.SeqNum(), uint64(0x08070605040302); got != want { + t.Errorf("seqNum = %d want %d", got, want) + } +} + +func TestInvalidInternalKey(t *testing.T) { + testCases := []string{ + "", + "\x01\x02\x03\x04\x05\x06\x07", + "foo", + "foo\x08\x07\x06\x05\x04\x03\x02", + "foo\x18\x07\x06\x05\x04\x03\x02\x01", + } + for _, tc := range testCases { + k := DecodeInternalKey([]byte(tc)) + if k.Valid() { + t.Errorf("%q is a valid key, want invalid", 
tc) + } + // Invalid key kind because the key doesn't have an 8 byte trailer. + if k.Kind() == InternalKeyKindInvalid && k.UserKey != nil { + t.Errorf("expected nil UserKey after decoding encodedKey=%q", tc) + } + } +} + +func TestInternalKeyComparer(t *testing.T) { + // keys are some internal keys, in sorted order. + keys := []string{ + // The remaining test keys are all valid. + "" + "\x01\xff\xff\xff\xff\xff\xff\xff", + "" + "\x00\xff\xff\xff\xff\xff\xff\xff", + "" + "\x01\x01\x00\x00\x00\x00\x00\x00", + "" + "\x00\x01\x00\x00\x00\x00\x00\x00", + // Invalid internal keys have no user key, but have trailer "\xff \x00 \x00 \x00 \x00 \x00 \x00 \x00" + // i.e. seqNum 0 and kind 255 (InternalKeyKindInvalid). + "", + "" + "\x01\x00\x00\x00\x00\x00\x00\x00", + "" + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00blue" + "\x01\x11\x00\x00\x00\x00\x00\x00", + "bl\x00ue" + "\x01\x11\x00\x00\x00\x00\x00\x00", + "blue" + "\x01\x11\x00\x00\x00\x00\x00\x00", + "blue\x00" + "\x01\x11\x00\x00\x00\x00\x00\x00", + "green" + "\xff\x11\x00\x00\x00\x00\x00\x00", + "green" + "\x01\x11\x00\x00\x00\x00\x00\x00", + "green" + "\x01\x00\x00\x00\x00\x00\x00\x00", + "red" + "\x01\xff\xff\xff\xff\xff\xff\xff", + "red" + "\x01\x72\x73\x74\x75\x76\x77\x78", + "red" + "\x01\x00\x00\x00\x00\x00\x00\x11", + "red" + "\x01\x00\x00\x00\x00\x00\x11\x00", + "red" + "\x01\x00\x00\x00\x00\x11\x00\x00", + "red" + "\x01\x00\x00\x00\x11\x00\x00\x00", + "red" + "\x01\x00\x00\x11\x00\x00\x00\x00", + "red" + "\x01\x00\x11\x00\x00\x00\x00\x00", + "red" + "\x01\x11\x00\x00\x00\x00\x00\x00", + "red" + "\x00\x11\x00\x00\x00\x00\x00\x00", + "red" + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\xfe" + "\x01\xff\xff\xff\xff\xff\xff\xff", + "\xfe" + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\xff" + "\x01\xff\xff\xff\xff\xff\xff\xff", + "\xff" + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\xff\x40" + "\x01\xff\xff\xff\xff\xff\xff\xff", + "\xff\x40" + "\x00\x00\x00\x00\x00\x00\x00\x00", + 
"\xff\xff" + "\x01\xff\xff\xff\xff\xff\xff\xff", + "\xff\xff" + "\x00\x00\x00\x00\x00\x00\x00\x00", + } + c := DefaultComparer.Compare + for i := range keys { + for j := range keys { + ik := DecodeInternalKey([]byte(keys[i])) + jk := DecodeInternalKey([]byte(keys[j])) + got := InternalCompare(c, ik, jk) + want := 0 + if i < j { + want = -1 + } else if i > j { + want = +1 + } + if got != want { + t.Errorf("i=%d, j=%d, keys[i]=%q, keys[j]=%q: got %d, want %d", + i, j, keys[i], keys[j], got, want) + } + } + } +} + +func TestKindsRoundtrip(t *testing.T) { + for kindNum, prettied := range internalKeyKindNames { + if prettied == "" { + continue + } + kind := InternalKeyKind(kindNum) + got := ParseKind(kind.String()) + require.Equal(t, got, kind) + } +} + +func TestInternalKeySeparator(t *testing.T) { + testCases := []struct { + a string + b string + expected string + }{ + {"foo.SET.100", "foo.SET.99", "foo.SET.100"}, + {"foo.SET.100", "foo.SET.100", "foo.SET.100"}, + {"foo.SET.100", "foo.DEL.100", "foo.SET.100"}, + {"foo.SET.100", "foo.SET.101", "foo.SET.100"}, + {"foo.SET.100", "bar.SET.99", "foo.SET.100"}, + {"foo.SET.100", "hello.SET.200", "g.SEPARATOR.72057594037927935"}, + {"ABC1AAAAA.SET.100", "ABC2ABB.SET.200", "ABC2.SEPARATOR.72057594037927935"}, + {"AAA1AAA.SET.100", "AAA2AA.SET.200", "AAA2.SEPARATOR.72057594037927935"}, + {"AAA1AAA.SET.100", "AAA4.SET.200", "AAA2.SEPARATOR.72057594037927935"}, + {"AAA1AAA.SET.100", "AAA2.SET.200", "AAA1B.SEPARATOR.72057594037927935"}, + {"AAA1AAA.SET.100", "AAA2A.SET.200", "AAA2.SEPARATOR.72057594037927935"}, + {"AAA1.SET.100", "AAA2.SET.200", "AAA1.SET.100"}, + {"foo.SET.100", "foobar.SET.200", "foo.SET.100"}, + {"foobar.SET.100", "foo.SET.200", "foobar.SET.100"}, + {"foo.INGESTSST.100", "foo.INGESTSST.99", "foo.INGESTSST.100"}, + } + d := DefaultComparer + for _, c := range testCases { + t.Run("", func(t *testing.T) { + a := ParseInternalKey(c.a) + b := ParseInternalKey(c.b) + expected := ParseInternalKey(c.expected) + result 
:= a.Separator(d.Compare, d.Separator, nil, b) + if cmp := InternalCompare(d.Compare, expected, result); cmp != 0 { + t.Fatalf("expected %s, but found %s", expected, result) + } + }) + } +} + +func TestIsExclusiveSentinel(t *testing.T) { + userKey := []byte("foo") + testCases := []struct { + name string + key InternalKey + want bool + }{ + { + name: "rangedel; max seqnum", + key: MakeInternalKey(userKey, InternalKeySeqNumMax, InternalKeyKindRangeKeyDelete), + want: true, + }, + { + name: "rangedel; non-max seqnum", + key: MakeInternalKey(userKey, 42, InternalKeyKindRangeKeyDelete), + want: false, + }, + { + name: "rangekeyset; max seqnum", + key: MakeInternalKey(userKey, InternalKeySeqNumMax, InternalKeyKindRangeKeySet), + want: true, + }, + { + name: "rangekeyset; non-max seqnum", + key: MakeInternalKey(userKey, 42, InternalKeyKindRangeKeySet), + want: false, + }, + { + name: "rangekeyunset; max seqnum", + key: MakeInternalKey(userKey, InternalKeySeqNumMax, InternalKeyKindRangeKeyUnset), + want: true, + }, + { + name: "rangekeyunset; non-max seqnum", + key: MakeInternalKey(userKey, 42, InternalKeyKindRangeKeyUnset), + want: false, + }, + { + name: "rangekeydel; max seqnum", + key: MakeInternalKey(userKey, InternalKeySeqNumMax, InternalKeyKindRangeKeyDelete), + want: true, + }, + { + name: "rangekeydel; non-max seqnum", + key: MakeInternalKey(userKey, 42, InternalKeyKindRangeKeyDelete), + want: false, + }, + { + name: "neither rangedel nor rangekey", + key: MakeInternalKey(userKey, InternalKeySeqNumMax, InternalKeyKindSet), + want: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := tc.key.IsExclusiveSentinel() + require.Equal(t, tc.want, got) + }) + } +} diff --git a/pebble/internal/base/iterator.go b/pebble/internal/base/iterator.go new file mode 100644 index 0000000..1b72432 --- /dev/null +++ b/pebble/internal/base/iterator.go @@ -0,0 +1,414 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. 
All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "context" + "fmt" + "time" +) + +// InternalIterator iterates over a DB's key/value pairs in key order. Unlike +// the Iterator interface, the returned keys are InternalKeys composed of the +// user-key, a sequence number and a key kind. In forward iteration, key/value +// pairs for identical user-keys are returned in descending sequence order. In +// reverse iteration, key/value pairs for identical user-keys are returned in +// ascending sequence order. +// +// InternalIterators provide 5 absolute positioning methods and 2 relative +// positioning methods. The absolute positioning methods are: +// +// - SeekGE +// - SeekPrefixGE +// - SeekLT +// - First +// - Last +// +// The relative positioning methods are: +// +// - Next +// - Prev +// +// The relative positioning methods can be used in conjunction with any of the +// absolute positioning methods with one exception: SeekPrefixGE does not +// support reverse iteration via Prev. It is undefined to call relative +// positioning methods without ever calling an absolute positioning method. +// +// InternalIterators can optionally implement a prefix iteration mode. This +// mode is entered by calling SeekPrefixGE and exited by any other absolute +// positioning method (SeekGE, SeekLT, First, Last). When in prefix iteration +// mode, a call to Next will advance to the next key which has the same +// "prefix" as the one supplied to SeekPrefixGE. Note that "prefix" in this +// context is not a strict byte prefix, but defined by byte equality for the +// result of the Comparer.Split method. An InternalIterator is not required to +// support prefix iteration mode, and can implement SeekPrefixGE by forwarding +// to SeekGE. 
When the iteration prefix is exhausted, it is not valid to call +// Next on an internal iterator that's already returned (nil,nilv) or a key +// beyond the prefix. +// +// Bounds, [lower, upper), can be set on iterators, either using the SetBounds() +// function in the interface, or in implementation specific ways during iterator +// creation. The forward positioning routines (SeekGE, First, and Next) only +// check the upper bound. The reverse positioning routines (SeekLT, Last, and +// Prev) only check the lower bound. It is up to the caller to ensure that the +// forward positioning routines respect the lower bound and the reverse +// positioning routines respect the upper bound (i.e. calling SeekGE instead of +// First if there is a lower bound, and SeekLT instead of Last if there is an +// upper bound). This imposition is done in order to elevate that enforcement to +// the caller (generally pebble.Iterator or pebble.mergingIter) rather than +// having it duplicated in every InternalIterator implementation. +// +// Additionally, the caller needs to ensure that SeekGE/SeekPrefixGE are not +// called with a key > the upper bound, and SeekLT is not called with a key < +// the lower bound. InternalIterator implementations are required to respect +// the iterator bounds, never returning records outside of the bounds with one +// exception: an iterator may generate synthetic RANGEDEL marker records. See +// levelIter.syntheticBoundary for the sole existing example of this behavior. +// Specifically, levelIter can return synthetic keys whose user key is equal to +// the lower/upper bound. +// +// The bounds provided to an internal iterator must remain valid until a +// subsequent call to SetBounds has returned. This requirement exists so that +// iterator implementations may compare old and new bounds to apply low-level +// optimizations. The pebble.Iterator satisfies this requirement by maintaining +// two bound buffers and switching between them. 
+// +// An iterator must be closed after use, but it is not necessary to read an +// iterator until exhaustion. +// +// An iterator is not goroutine-safe, but it is safe to use multiple iterators +// concurrently, either in separate goroutines or switching between the +// iterators in a single goroutine. +// +// It is also safe to use an iterator concurrently with modifying its +// underlying DB, if that DB permits modification. However, the resultant +// key/value pairs are not guaranteed to be a consistent snapshot of that DB +// at a particular point in time. +// +// InternalIterators accumulate errors encountered during operation, exposing +// them through the Error method. All of the absolute positioning methods +// reset any accumulated error before positioning. Relative positioning +// methods return without advancing if the iterator has accumulated an error. +// +// nilv == shorthand for LazyValue{}, which represents a nil value. +type InternalIterator interface { + // SeekGE moves the iterator to the first key/value pair whose key is greater + // than or equal to the given key. Returns the key and value if the iterator + // is pointing at a valid entry, and (nil, nilv) otherwise. Note that SeekGE + // only checks the upper bound. It is up to the caller to ensure that key + // is greater than or equal to the lower bound. + SeekGE(key []byte, flags SeekGEFlags) (*InternalKey, LazyValue) + + // SeekPrefixGE moves the iterator to the first key/value pair whose key is + // greater than or equal to the given key. Returns the key and value if the + // iterator is pointing at a valid entry, and (nil, nilv) otherwise. Note that + // SeekPrefixGE only checks the upper bound. It is up to the caller to ensure + // that key is greater than or equal to the lower bound. + // + // The prefix argument is used by some InternalIterator implementations (e.g. + // sstable.Reader) to avoid expensive operations. 
A user-defined Split + // function must be supplied to the Comparer for the DB. The supplied prefix + // will be the prefix of the given key returned by that Split function. If + // the iterator is able to determine that no key with the prefix exists, it + // can return (nil,nilv). Unlike SeekGE, this is not an indication that + // iteration is exhausted. + // + // Note that the iterator may return keys not matching the prefix. It is up + // to the caller to check if the prefix matches. + // + // Calling SeekPrefixGE places the receiver into prefix iteration mode. Once + // in this mode, reverse iteration may not be supported and will return an + // error. Note that pebble/Iterator.SeekPrefixGE has this same restriction on + // not supporting reverse iteration in prefix iteration mode until a + // different positioning routine (SeekGE, SeekLT, First or Last) switches the + // iterator out of prefix iteration. + SeekPrefixGE(prefix, key []byte, flags SeekGEFlags) (*InternalKey, LazyValue) + + // SeekLT moves the iterator to the last key/value pair whose key is less + // than the given key. Returns the key and value if the iterator is pointing + // at a valid entry, and (nil, nilv) otherwise. Note that SeekLT only checks + // the lower bound. It is up to the caller to ensure that key is less than + // the upper bound. + SeekLT(key []byte, flags SeekLTFlags) (*InternalKey, LazyValue) + + // First moves the iterator the the first key/value pair. Returns the key and + // value if the iterator is pointing at a valid entry, and (nil, nilv) + // otherwise. Note that First only checks the upper bound. It is up to the + // caller to ensure that First() is not called when there is a lower bound, + // and instead call SeekGE(lower). + First() (*InternalKey, LazyValue) + + // Last moves the iterator the the last key/value pair. Returns the key and + // value if the iterator is pointing at a valid entry, and (nil, nilv) + // otherwise. Note that Last only checks the lower bound. 
It is up to the + // caller to ensure that Last() is not called when there is an upper bound, + // and instead call SeekLT(upper). + Last() (*InternalKey, LazyValue) + + // Next moves the iterator to the next key/value pair. Returns the key and + // value if the iterator is pointing at a valid entry, and (nil, nilv) + // otherwise. Note that Next only checks the upper bound. It is up to the + // caller to ensure that key is greater than or equal to the lower bound. + // + // It is valid to call Next when the iterator is positioned before the first + // key/value pair due to either a prior call to SeekLT or Prev which returned + // (nil, nilv). It is not allowed to call Next when the previous call to SeekGE, + // SeekPrefixGE or Next returned (nil, nilv). + Next() (*InternalKey, LazyValue) + + // NextPrefix moves the iterator to the next key/value pair with a different + // prefix than the key at the current iterator position. Returns the key and + // value if the iterator is pointing at a valid entry, and (nil, nil) + // otherwise. Note that NextPrefix only checks the upper bound. It is up to + // the caller to ensure that key is greater than or equal to the lower + // bound. + // + // NextPrefix is passed the immediate successor to the current prefix key. A + // valid implementation of NextPrefix is to call SeekGE with succKey. + // + // It is not allowed to call NextPrefix when the previous call was a reverse + // positioning operation or a call to a forward positioning method that + // returned (nil, nilv). It is also not allowed to call NextPrefix when the + // iterator is in prefix iteration mode. + NextPrefix(succKey []byte) (*InternalKey, LazyValue) + + // Prev moves the iterator to the previous key/value pair. Returns the key + // and value if the iterator is pointing at a valid entry, and (nil, nilv) + // otherwise. Note that Prev only checks the lower bound. It is up to the + // caller to ensure that key is less than the upper bound. 
+ // + // It is valid to call Prev when the iterator is positioned after the last + // key/value pair due to either a prior call to SeekGE or Next which returned + // (nil, nilv). It is not allowed to call Prev when the previous call to SeekLT + // or Prev returned (nil, nilv). + Prev() (*InternalKey, LazyValue) + + // Error returns any accumulated error. It may not include errors returned + // to the client when calling LazyValue.Value(). + Error() error + + // Close closes the iterator and returns any accumulated error. Exhausting + // all the key/value pairs in a table is not considered to be an error. + // It is valid to call Close multiple times. Other methods should not be + // called after the iterator has been closed. + Close() error + + // SetBounds sets the lower and upper bounds for the iterator. Note that the + // result of Next and Prev will be undefined until the iterator has been + // repositioned with SeekGE, SeekPrefixGE, SeekLT, First, or Last. + // + // The bounds provided must remain valid until a subsequent call to + // SetBounds has returned. This requirement exists so that iterator + // implementations may compare old and new bounds to apply low-level + // optimizations. + SetBounds(lower, upper []byte) + + // SetContext replaces the context provided at iterator creation, or the + // last one provided by SetContext. + SetContext(ctx context.Context) + + fmt.Stringer +} + +// SeekGEFlags holds flags that may configure the behavior of a forward seek. +// Not all flags are relevant to all iterators. +type SeekGEFlags uint8 + +const ( + seekGEFlagTrySeekUsingNext uint8 = iota + seekGEFlagRelativeSeek + seekGEFlagBatchJustRefreshed +) + +// SeekGEFlagsNone is the default value of SeekGEFlags, with all flags disabled. 
+const SeekGEFlagsNone = SeekGEFlags(0) + +// TrySeekUsingNext indicates whether a performance optimization was enabled +// by a caller, indicating the caller has not done any action to move this +// iterator beyond the first key that would be found if this iterator were to +// honestly do the intended seek. For example, say the caller did a +// SeekGE(k1...), followed by SeekGE(k2...) where k1 <= k2, without any +// intermediate positioning calls. The caller can safely specify true for this +// parameter in the second call. As another example, say the caller did do one +// call to Next between the two Seek calls, and k1 < k2. Again, the caller can +// safely specify a true value for this parameter. Note that a false value is +// always safe. The callee is free to ignore the true value if its +// implementation does not permit this optimization. +// +// We make the caller do this determination since a string comparison of k1, k2 +// is not necessarily cheap, and there may be many iterators in the iterator +// stack. Doing it once at the root of the iterator stack is cheaper. +// +// This optimization could also be applied to SeekLT (where it would be +// trySeekUsingPrev). We currently only do it for SeekPrefixGE and SeekGE +// because this is where this optimization helps the performance of CockroachDB. +// The SeekLT cases in CockroachDB are typically accompanied with bounds that +// change between seek calls, and is optimized inside certain iterator +// implementations, like singleLevelIterator, without any extra parameter +// passing (though the same amortization of string comparisons could be done to +// improve that optimization, by making the root of the iterator stack do it). 
+func (s SeekGEFlags) TrySeekUsingNext() bool { return (s & (1 << seekGEFlagTrySeekUsingNext)) != 0 } + +// RelativeSeek is set when in the course of a forward positioning operation, a +// higher-level iterator seeks a lower-level iterator to a larger key than the +// one at the current iterator position. +// +// Concretely, this occurs when the merging iterator observes a range deletion +// covering the key at a level's current position, and the merging iterator +// seeks the level to the range deletion's end key. During lazy-combined +// iteration, this flag signals to the level iterator that the seek is NOT an +// absolute-positioning operation from the perspective of the pebble.Iterator, +// and the level iterator must look for range keys in tables between the current +// iterator position and the new seeked position. +func (s SeekGEFlags) RelativeSeek() bool { return (s & (1 << seekGEFlagRelativeSeek)) != 0 } + +// BatchJustRefreshed is set by Seek[Prefix]GE when an iterator's view of an +// indexed batch was just refreshed. It serves as a signal to the batch iterator +// to ignore the TrySeekUsingNext optimization, because the external knowledge +// imparted by the TrySeekUsingNext flag does not apply to the batch iterator's +// position. See (pebble.Iterator).batchJustRefreshed. +func (s SeekGEFlags) BatchJustRefreshed() bool { return (s & (1 << seekGEFlagBatchJustRefreshed)) != 0 } + +// EnableTrySeekUsingNext returns the provided flags with the +// try-seek-using-next optimization enabled. See TrySeekUsingNext for an +// explanation of this optimization. +func (s SeekGEFlags) EnableTrySeekUsingNext() SeekGEFlags { + return s | (1 << seekGEFlagTrySeekUsingNext) +} + +// DisableTrySeekUsingNext returns the provided flags with the +// try-seek-using-next optimization disabled. 
+func (s SeekGEFlags) DisableTrySeekUsingNext() SeekGEFlags { + return s &^ (1 << seekGEFlagTrySeekUsingNext) +} + +// EnableRelativeSeek returns the provided flags with the relative-seek flag +// enabled. See RelativeSeek for an explanation of this flag's use. +func (s SeekGEFlags) EnableRelativeSeek() SeekGEFlags { + return s | (1 << seekGEFlagRelativeSeek) +} + +// DisableRelativeSeek returns the provided flags with the relative-seek flag +// disabled. +func (s SeekGEFlags) DisableRelativeSeek() SeekGEFlags { + return s &^ (1 << seekGEFlagRelativeSeek) +} + +// EnableBatchJustRefreshed returns the provided flags with the +// batch-just-refreshed bit set. See BatchJustRefreshed for an explanation of +// this flag. +func (s SeekGEFlags) EnableBatchJustRefreshed() SeekGEFlags { + return s | (1 << seekGEFlagBatchJustRefreshed) +} + +// DisableBatchJustRefreshed returns the provided flags with the +// batch-just-refreshed bit unset. +func (s SeekGEFlags) DisableBatchJustRefreshed() SeekGEFlags { + return s &^ (1 << seekGEFlagBatchJustRefreshed) +} + +// SeekLTFlags holds flags that may configure the behavior of a reverse seek. +// Not all flags are relevant to all iterators. +type SeekLTFlags uint8 + +const ( + seekLTFlagRelativeSeek uint8 = iota +) + +// SeekLTFlagsNone is the default value of SeekLTFlags, with all flags disabled. +const SeekLTFlagsNone = SeekLTFlags(0) + +// RelativeSeek is set when in the course of a reverse positioning operation, a +// higher-level iterator seeks a lower-level iterator to a smaller key than the +// one at the current iterator position. +// +// Concretely, this occurs when the merging iterator observes a range deletion +// covering the key at a level's current position, and the merging iterator +// seeks the level to the range deletion's start key. 
During lazy-combined +// iteration, this flag signals to the level iterator that the seek is NOT an +// absolute-positioning operation from the perspective of the pebble.Iterator, +// and the level iterator must look for range keys in tables between the current +// iterator position and the new seeked position. +func (s SeekLTFlags) RelativeSeek() bool { return s&(1<= len(ref) { + // Verify that ref matches the flag predicates. + for j := 0; j < i; j++ { + if got := flags[j].pred(); ref[j] != got { + t.Errorf("%s() = %t, want %t", flags[j].label, got, ref[j]) + } + } + return + } + + // flag i remains unset. + t.Run(fmt.Sprintf("%s begin unset", flags[i].label), func(t *testing.T) { + checkCombination(t, i+1, flags, ref) + }) + + // set flag i + ref[i] = true + flags[i].set() + t.Run(fmt.Sprintf("%s set", flags[i].label), func(t *testing.T) { + checkCombination(t, i+1, flags, ref) + }) + + // unset flag i + ref[i] = false + flags[i].unset() + t.Run(fmt.Sprintf("%s unset", flags[i].label), func(t *testing.T) { + checkCombination(t, i+1, flags, ref) + }) +} diff --git a/pebble/internal/base/lazy_value.go b/pebble/internal/base/lazy_value.go new file mode 100644 index 0000000..cc6d56d --- /dev/null +++ b/pebble/internal/base/lazy_value.go @@ -0,0 +1,287 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import "github.com/cockroachdb/pebble/internal/invariants" + +// A value can have user-defined attributes that are a function of the value +// byte slice. For now, we only support "short attributes", which can be +// encoded in 3 bits. We will likely extend this to "long attributes" later +// for values that are even more expensive to access than those in value +// blocks in the same sstable. 
+// +// When a sstable writer chooses not to store a value together with the key, +// it can call the ShortAttributeExtractor to extract the attribute and store +// it together with the key. This allows for cheap retrieval of +// AttributeAndLen on the read-path, without doing a more expensive retrieval +// of the value. In general, the extraction code may want to also look at the +// key to decide how to treat the value, hence the key* parameters. +// +// Write path performance: The ShortAttributeExtractor func cannot be inlined, +// so we will pay the cost of this function call. However, we will only pay +// this when (a) the value is not being stored together with the key, and (b) +// the key-value pair is being initially written to the DB, or a compaction is +// transitioning the key-value pair from being stored together to being stored +// separately. + +// ShortAttribute encodes a user-specified attribute of the value. +type ShortAttribute uint8 + +// MaxShortAttribute is the maximum value of the short attribute (3 bits). +const MaxShortAttribute = 7 + +// ShortAttributeExtractor is an extractor that given the value, will return +// the ShortAttribute. +type ShortAttributeExtractor func( + key []byte, keyPrefixLen int, value []byte) (ShortAttribute, error) + +// AttributeAndLen represents the pair of value length and the short +// attribute. +type AttributeAndLen struct { + ValueLen int32 + ShortAttribute ShortAttribute +} + +// LazyValue represents a value that may not already have been extracted. +// Currently, it can represent either an in-place value (stored with the key) +// or a value stored in the value section. However, the interface is general +// enough to support values that are stored in separate files. +// +// LazyValue is used in the InternalIterator interface, such that all +// positioning calls return (*InternalKey, LazyValue). 
It is also exposed via +// the public Iterator for callers that need to remember a recent but not +// necessarily latest LazyValue, in case they need the actual value in the +// future. An example is a caller that is iterating in reverse and looking for +// the latest MVCC version for a key -- it cannot identify the latest MVCC +// version without stepping to the previous key-value pair e.g. +// storage.pebbleMVCCScanner in CockroachDB. +// +// Performance note: It is important for this struct to not exceed a sizeof 32 +// bytes, for optimizing the common case of the in-place value. Prior to +// introducing LazyValue, we were passing around a []byte which is 24 bytes. +// Passing a 40 byte or larger struct causes performance to drop by 75% on +// some benchmarks that do tight iteration loops. +// +// Memory management: +// This is subtle, but important for performance. +// +// A LazyValue returned by an InternalIterator or Iterator is unstable in that +// repositioning the iterator will invalidate the memory inside it. A caller +// wishing to maintain that LazyValue needs to call LazyValue.Clone(). Note +// that this does not fetch the value if it is not in-place. Clone() should +// ideally not be called if LazyValue.Value() has been called, since the +// cloned LazyValue will forget the extracted/fetched value, and calling +// Value() on this clone will cause the value to be extracted again. That is, +// Clone() does not make any promise about the memory stability of the +// underlying value. +// +// A user of an iterator that calls LazyValue.Value() wants as much as +// possible for the returned value []byte to point to iterator owned memory. +// +// 1. [P1] The underlying iterator that owns that memory also needs a promise +// from that user that at any time there is at most one value []byte slice +// that the caller is expecting it to maintain. 
Otherwise, the underlying +// iterator has to maintain multiple such []byte slices which results in +// more complicated and inefficient code. +// +// 2. [P2] The underlying iterator, in order to make the promise that it is +// maintaining the one value []byte slice, also needs a way to know when +// it is relieved of that promise. One way it is relieved of that promise +// is by being told that it is being repositioned. Typically, the owner of +// the value []byte slice is a sstable iterator, and it will know that it +// is relieved of the promise when it is repositioned. However, consider +// the case where the caller has used LazyValue.Clone() and repositioned +// the iterator (which is actually a tree of iterators). In this case the +// underlying sstable iterator may not even be open. LazyValue.Value() +// will still work (at a higher cost), but since the sstable iterator is +// not open, it does not have a mechanism to know when the retrieved value +// is no longer in use. We refer to this situation as "not satisfying P2". +// To handle this situation, the LazyValue.Value() method accepts a caller +// owned buffer, that the callee will use if needed. The callee explicitly +// tells the caller whether the []byte slice for the value is now owned by +// the caller. This will be true if the callee attempted to use buf and +// either successfully used it or allocated a new []byte slice. +// +// To ground the above in reality, we consider three examples of callers of +// LazyValue.Value(): +// +// - Iterator: it calls LazyValue.Value for its own use when merging values. +// When merging during reverse iteration, it may have cloned the LazyValue. +// In this case it calls LazyValue.Value() on the cloned value, merges it, +// and then calls LazyValue.Value() on the current iterator position and +// merges it. So it is honoring P1. +// +// - Iterator on behalf of Iterator clients: The Iterator.Value() method +// needs to call LazyValue.Value(). 
The client of Iterator is satisfying P1 +// because of the inherent Iterator interface constraint, i.e., it is calling +// Iterator.Value() on the current Iterator position. It is possible that +// the Iterator has cloned this LazyValue (for the reverse iteration case), +// which the client is unaware of, so the underlying sstable iterator may +// not be able to satisfy P2. This is ok because Iterator will call +// LazyValue.Value with its (reusable) owned buffer. +// +// - CockroachDB's pebbleMVCCScanner: This will use LazyValues from Iterator +// since during reverse iteration in order to find the highest version that +// satisfies a read it needs to clone the LazyValue, step back the iterator +// and then decide whether it needs the value from the previously cloned +// LazyValue. The pebbleMVCCScanner will satisfy P1. The P2 story is +// similar to the previous case in that it will call LazyValue.Value with +// its (reusable) owned buffer. +// +// Corollary: callers that directly use InternalIterator can know that they +// have done nothing to interfere with promise P2 can pass in a nil buf and be +// sure that it will not trigger an allocation. +// +// Repeated calling of LazyValue.Value: +// This is ok as long as the caller continues to satisfy P1. The previously +// fetched value will be remembered inside LazyValue to avoid fetching again. +// So if the caller's buffer is used the first time the value was fetched, it +// is still in use. +// +// LazyValue fields are visible outside the package for use in +// InternalIterator implementations and in Iterator, but not meant for direct +// use by users of Pebble. +type LazyValue struct { + // ValueOrHandle represents a value, or a handle to be passed to ValueFetcher. + // - Fetcher == nil: ValueOrHandle is a value. + // - Fetcher != nil: ValueOrHandle is a handle and Fetcher.Attribute is + // initialized. 
+ // The ValueOrHandle exposed by InternalIterator or Iterator may not be stable + // if the iterator is stepped. To make it stable, make a copy using Clone. + ValueOrHandle []byte + // Fetcher provides support for fetching an actually lazy value. + Fetcher *LazyFetcher +} + +// LazyFetcher supports fetching a lazy value. +// +// Fetcher and Attribute are to be initialized at creation time. The fields +// are arranged to reduce the sizeof this struct. +type LazyFetcher struct { + // Fetcher, given a handle, returns the value. + Fetcher ValueFetcher + err error + value []byte + // Attribute includes the short attribute and value length. + Attribute AttributeAndLen + fetched bool + callerOwned bool +} + +// ValueFetcher is an interface for fetching a value. +type ValueFetcher interface { + // Fetch returns the value, given the handle. It is acceptable to call the + // ValueFetcher.Fetch as long as the DB is open. However, one should assume + // there is a fast-path when the iterator tree has not moved off the sstable + // iterator that initially provided this LazyValue. Hence, to utilize this + // fast-path the caller should try to decide whether it needs the value or + // not as soon as possible, with minimal possible stepping of the iterator. + // + // buf will be used if the fetcher cannot satisfy P2 (see earlier comment). + // If the fetcher attempted to use buf *and* len(buf) was insufficient, it + // will allocate a new slice for the value. In either case it will set + // callerOwned to true. + Fetch( + handle []byte, valLen int32, buf []byte) (val []byte, callerOwned bool, err error) +} + +// Value returns the underlying value. +func (lv *LazyValue) Value(buf []byte) (val []byte, callerOwned bool, err error) { + if lv.Fetcher == nil { + return lv.ValueOrHandle, false, nil + } + // Do the rest of the work in a separate method to attempt mid-stack + // inlining of Value(). 
Unfortunately, this still does not inline since the + // cost of 85 exceeds the budget of 80. + // + // TODO(sumeer): Packing the return values into a struct{[]byte error bool} + // causes it to be below the budget. Consider this if we need to recover + // more performance. I suspect that inlining this only matters in + // micro-benchmarks, and in actual use cases in CockroachDB it will not + // matter because there is substantial work done with a fetched value. + return lv.fetchValue(buf) +} + +// INVARIANT: lv.Fetcher != nil +func (lv *LazyValue) fetchValue(buf []byte) (val []byte, callerOwned bool, err error) { + f := lv.Fetcher + if !f.fetched { + f.fetched = true + f.value, f.callerOwned, f.err = f.Fetcher.Fetch( + lv.ValueOrHandle, lv.Fetcher.Attribute.ValueLen, buf) + } + return f.value, f.callerOwned, f.err +} + +// InPlaceValue returns the value under the assumption that it is in-place. +// This is for Pebble-internal code. +func (lv *LazyValue) InPlaceValue() []byte { + if invariants.Enabled && lv.Fetcher != nil { + panic("value must be in-place") + } + return lv.ValueOrHandle +} + +// Len returns the length of the value. +func (lv *LazyValue) Len() int { + if lv.Fetcher == nil { + return len(lv.ValueOrHandle) + } + return int(lv.Fetcher.Attribute.ValueLen) +} + +// TryGetShortAttribute returns the ShortAttribute and a bool indicating +// whether the ShortAttribute was populated. +func (lv *LazyValue) TryGetShortAttribute() (ShortAttribute, bool) { + if lv.Fetcher == nil { + return 0, false + } + return lv.Fetcher.Attribute.ShortAttribute, true +} + +// Clone creates a stable copy of the LazyValue, by appending bytes to buf. +// The fetcher parameter must be non-nil and may be over-written and used +// inside the returned LazyValue -- this is needed to avoid an allocation. +// Most callers have at most K cloned LazyValues, where K is hard-coded, so +// they can have a pool of exactly K LazyFetcher structs they can reuse in +// these calls. 
The alternative of allocating LazyFetchers from a sync.Pool is +// not viable since we have no code trigger for returning to the pool +// (LazyValues are simply GC'd). +// +// NB: It is highly preferable that LazyValue.Value() has not been called, +// since the Clone will forget any previously extracted value, and a future +// call to Value will cause it to be fetched again. We do this since we don't +// want to reason about whether or not to clone an already extracted value +// inside the Fetcher (we don't). Property P1 applies here too: if lv1.Value() +// has been called, and then lv2 is created as a clone of lv1, then calling +// lv2.Value() can invalidate any backing memory maintained inside the fetcher +// for lv1 (even though these are the same values). We initially prohibited +// calling LazyValue.Clone() if LazyValue.Value() has been called, but there +// is at least one complex caller (pebbleMVCCScanner inside CockroachDB) where +// it is not easy to prove this invariant. +func (lv *LazyValue) Clone(buf []byte, fetcher *LazyFetcher) (LazyValue, []byte) { + var lvCopy LazyValue + if lv.Fetcher != nil { + *fetcher = LazyFetcher{ + Fetcher: lv.Fetcher.Fetcher, + Attribute: lv.Fetcher.Attribute, + // Not copying anything that has been extracted. + } + lvCopy.Fetcher = fetcher + } + vLen := len(lv.ValueOrHandle) + if vLen == 0 { + return lvCopy, buf + } + bufLen := len(buf) + buf = append(buf, lv.ValueOrHandle...) + lvCopy.ValueOrHandle = buf[bufLen : bufLen+vLen] + return lvCopy, buf +} + +// MakeInPlaceValue constructs an in-place value. +func MakeInPlaceValue(val []byte) LazyValue { + return LazyValue{ValueOrHandle: val} +} diff --git a/pebble/internal/base/lazy_value_test.go b/pebble/internal/base/lazy_value_test.go new file mode 100644 index 0000000..82ad51c --- /dev/null +++ b/pebble/internal/base/lazy_value_test.go @@ -0,0 +1,74 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "bytes" + "testing" + "unsafe" + + "github.com/stretchr/testify/require" +) + +type valueFetcherFunc func( + handle []byte, valLen int32, buf []byte) (val []byte, callerOwned bool, err error) + +func (v valueFetcherFunc) Fetch( + handle []byte, valLen int32, buf []byte, +) (val []byte, callerOwned bool, err error) { + return v(handle, valLen, buf) +} + +func TestLazyValue(t *testing.T) { + // Both 40 and 48 bytes makes iteration benchmarks like + // BenchmarkIteratorScan/keys=1000,r-amp=1,key-types=points-only 75% + // slower. + require.True(t, unsafe.Sizeof(LazyValue{}) <= 32) + + fooBytes1 := []byte("foo") + fooLV1 := MakeInPlaceValue(fooBytes1) + require.Equal(t, 3, fooLV1.Len()) + _, hasAttr := fooLV1.TryGetShortAttribute() + require.False(t, hasAttr) + fooLV2, fooBytes2 := fooLV1.Clone(nil, &LazyFetcher{}) + require.Equal(t, 3, fooLV2.Len()) + _, hasAttr = fooLV2.TryGetShortAttribute() + require.False(t, hasAttr) + require.Equal(t, fooLV1.InPlaceValue(), fooLV2.InPlaceValue()) + getValue := func(lv LazyValue, expectedCallerOwned bool) []byte { + v, callerOwned, err := lv.Value(nil) + require.NoError(t, err) + require.Equal(t, expectedCallerOwned, callerOwned) + return v + } + require.Equal(t, getValue(fooLV1, false), getValue(fooLV2, false)) + fooBytes2[0] = 'b' + require.False(t, bytes.Equal(fooLV1.InPlaceValue(), fooLV2.InPlaceValue())) + + for _, callerOwned := range []bool{false, true} { + numCalls := 0 + fooLV3 := LazyValue{ + ValueOrHandle: []byte("foo-handle"), + Fetcher: &LazyFetcher{ + Fetcher: valueFetcherFunc( + func(handle []byte, valLen int32, buf []byte) ([]byte, bool, error) { + numCalls++ + require.Equal(t, []byte("foo-handle"), handle) + require.Equal(t, int32(3), valLen) + return fooBytes1, callerOwned, nil + }), + Attribute: AttributeAndLen{ValueLen: 3, ShortAttribute: 7}, + }, + } + require.Equal(t, 
[]byte("foo"), getValue(fooLV3, callerOwned)) + require.Equal(t, 1, numCalls) + require.Equal(t, []byte("foo"), getValue(fooLV3, callerOwned)) + require.Equal(t, 1, numCalls) + require.Equal(t, 3, fooLV3.Len()) + attr, hasAttr := fooLV3.TryGetShortAttribute() + require.True(t, hasAttr) + require.Equal(t, ShortAttribute(7), attr) + } +} diff --git a/pebble/internal/base/logger.go b/pebble/internal/base/logger.go new file mode 100644 index 0000000..5448137 --- /dev/null +++ b/pebble/internal/base/logger.go @@ -0,0 +1,158 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "bytes" + "context" + "fmt" + "log" + "os" + "runtime" + "sync" + + "github.com/cockroachdb/pebble/internal/invariants" +) + +// Logger defines an interface for writing log messages. +type Logger interface { + Infof(format string, args ...interface{}) + Errorf(format string, args ...interface{}) + Fatalf(format string, args ...interface{}) +} +type defaultLogger struct{} + +// DefaultLogger logs to the Go stdlib logs. +var DefaultLogger defaultLogger + +var _ Logger = DefaultLogger + +// Infof implements the Logger.Infof interface. +func (defaultLogger) Infof(format string, args ...interface{}) { + _ = log.Output(2, fmt.Sprintf(format, args...)) +} + +// Errorf implements the Logger.Errorf interface. +func (defaultLogger) Errorf(format string, args ...interface{}) { + _ = log.Output(2, fmt.Sprintf(format, args...)) +} + +// Fatalf implements the Logger.Fatalf interface. +func (defaultLogger) Fatalf(format string, args ...interface{}) { + _ = log.Output(2, fmt.Sprintf(format, args...)) + os.Exit(1) +} + +// InMemLogger implements Logger using an in-memory buffer (used for testing). +// The buffer can be read via String() and cleared via Reset(). 
+type InMemLogger struct { + mu struct { + sync.Mutex + buf bytes.Buffer + } +} + +var _ Logger = (*InMemLogger)(nil) + +// Reset clears the internal buffer. +func (b *InMemLogger) Reset() { + b.mu.Lock() + defer b.mu.Unlock() + b.mu.buf.Reset() +} + +// String returns the current internal buffer. +func (b *InMemLogger) String() string { + b.mu.Lock() + defer b.mu.Unlock() + return b.mu.buf.String() +} + +// Infof is part of the Logger interface. +func (b *InMemLogger) Infof(format string, args ...interface{}) { + s := fmt.Sprintf(format, args...) + b.mu.Lock() + defer b.mu.Unlock() + b.mu.buf.Write([]byte(s)) + if n := len(s); n == 0 || s[n-1] != '\n' { + b.mu.buf.Write([]byte("\n")) + } +} + +// Errorf is part of the Logger interface. +func (b *InMemLogger) Errorf(format string, args ...interface{}) { + b.Infof(format, args...) +} + +// Fatalf is part of the Logger interface. +func (b *InMemLogger) Fatalf(format string, args ...interface{}) { + b.Infof(format, args...) + runtime.Goexit() +} + +// LoggerAndTracer defines an interface for logging and tracing. +type LoggerAndTracer interface { + Logger + // Eventf formats and emits a tracing log, if tracing is enabled in the + // current context. + Eventf(ctx context.Context, format string, args ...interface{}) + // IsTracingEnabled returns true if tracing is enabled. It can be used as an + // optimization to avoid calling Eventf (which will be a noop when tracing + // is not enabled) to avoid the overhead of boxing the args. + IsTracingEnabled(ctx context.Context) bool +} + +// LoggerWithNoopTracer wraps a logger and does no tracing. +type LoggerWithNoopTracer struct { + Logger +} + +var _ LoggerAndTracer = &LoggerWithNoopTracer{} + +// Eventf implements LoggerAndTracer. +func (*LoggerWithNoopTracer) Eventf(ctx context.Context, format string, args ...interface{}) { + if invariants.Enabled && ctx == nil { + panic("Eventf context is nil") + } +} + +// IsTracingEnabled implements LoggerAndTracer. 
+func (*LoggerWithNoopTracer) IsTracingEnabled(ctx context.Context) bool { + if invariants.Enabled && ctx == nil { + panic("IsTracingEnabled ctx is nil") + } + return false +} + +// NoopLoggerAndTracer does no logging and tracing. Remember that struct{} is +// special cased in Go and does not incur an allocation when it backs the +// interface LoggerAndTracer. +type NoopLoggerAndTracer struct{} + +var _ LoggerAndTracer = NoopLoggerAndTracer{} + +// Infof implements LoggerAndTracer. +func (l NoopLoggerAndTracer) Infof(format string, args ...interface{}) {} + +// Errorf implements LoggerAndTracer. +func (l NoopLoggerAndTracer) Errorf(format string, args ...interface{}) {} + +// Fatalf implements LoggerAndTracer. +func (l NoopLoggerAndTracer) Fatalf(format string, args ...interface{}) {} + +// Eventf implements LoggerAndTracer. +func (l NoopLoggerAndTracer) Eventf(ctx context.Context, format string, args ...interface{}) { + if invariants.Enabled && ctx == nil { + panic("Eventf context is nil") + } +} + +// IsTracingEnabled implements LoggerAndTracer. +func (l NoopLoggerAndTracer) IsTracingEnabled(ctx context.Context) bool { + if invariants.Enabled && ctx == nil { + panic("IsTracingEnabled ctx is nil") + } + return false +} diff --git a/pebble/internal/base/merger.go b/pebble/internal/base/merger.go new file mode 100644 index 0000000..757d150 --- /dev/null +++ b/pebble/internal/base/merger.go @@ -0,0 +1,133 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import "io" + +// Merge creates a ValueMerger for the specified key initialized with the value +// of one merge operand. +type Merge func(key, value []byte) (ValueMerger, error) + +// ValueMerger receives merge operands one by one. 
The operand received is either +// newer or older than all operands received so far as indicated by the function +// names, `MergeNewer()` and `MergeOlder()`. Once all operands have been received, +// the client will invoke `Finish()` to obtain the final result. The order of +// a merge is not changed after the first call to `MergeNewer()` or +// `MergeOlder()`, i.e. the same method is used to submit all operands. +// +// The implementation may choose to merge values into the result immediately upon +// receiving each operand, or buffer operands until Finish() is called. For example, +// buffering may be useful to avoid (de)serializing partial merge results. +// +// The merge operation must be associative. That is, for the values A, B, C: +// +// Merge(A).MergeOlder(B).MergeOlder(C) == Merge(C).MergeNewer(B).MergeNewer(A) +// +// Examples of merge operators are integer addition, list append, and string +// concatenation. +type ValueMerger interface { + // MergeNewer adds an operand that is newer than all existing operands. + // The caller retains ownership of value. + // + // If an error is returned the merge is aborted and no other methods must + // be called. + MergeNewer(value []byte) error + + // MergeOlder adds an operand that is older than all existing operands. + // The caller retains ownership of value. + // + // If an error is returned the merge is aborted and no other methods must + // be called. + MergeOlder(value []byte) error + + // Finish does any final processing of the added operands and returns a + // result. The caller can assume the returned byte slice will not be mutated. + // + // Finish must be the last function called on the ValueMerger. The caller + // must not call any other ValueMerger functions after calling Finish. + // + // If `includesBase` is true, the oldest merge operand was part of the + // merge. 
This will always be true during normal iteration, but may be +// false during compaction when only a subset of operands may be +// available. Note that `includesBase` is set to true conservatively: a false +// value means that we could not definitely determine that the base merge +// operand was included. + // + // If a Closer is returned, the returned slice will remain valid until it is +// closed. The caller must arrange for the closer to be eventually closed. + Finish(includesBase bool) ([]byte, io.Closer, error) +} + +// DeletableValueMerger is an extension to ValueMerger which allows indicating that the +// result of a merge operation is non-existent. Such non-existent entries will eventually +// be deleted during compaction. Note that during compaction, non-existence of the result +// of a merge means that the merge operands will not result in any record being output. +// This is not the same as transforming the merge operands into a deletion tombstone, as +// older merge operands will still be visible during iteration. Deletion of the merge operands +// in this way is akin to the way a SingleDelete+Set combine into non-existence while leaving +// older records for the same key unaffected. +type DeletableValueMerger interface { + ValueMerger + + // DeletableFinish enables a value merger to indicate that the result of a merge operation + // is non-existent. See Finish for a description of includesBase. + DeletableFinish(includesBase bool) (value []byte, delete bool, closer io.Closer, err error) +} + +// Merger defines an associative merge operation. The merge operation merges +// two or more values for a single key. A merge operation is requested by +// writing a value using {Batch,DB}.Merge(). The value at that key is merged +// with any existing value. It is valid to Set a value at a key and then Merge +// a new value. Similar to non-merged values, a merged value can be deleted by +// either Delete or DeleteRange. 
+// +// The merge operation is invoked when a merge value is encountered during a +// read, either during a compaction or during iteration. +type Merger struct { + Merge Merge + + // Name is the name of the merger. + // + // Pebble stores the merger name on disk, and opening a database with a + // different merger from the one it was created with will result in an error. + Name string +} + +// AppendValueMerger concatenates merge operands in order from oldest to newest. +type AppendValueMerger struct { + buf []byte +} + +// MergeNewer appends value to the result. +func (a *AppendValueMerger) MergeNewer(value []byte) error { + a.buf = append(a.buf, value...) + return nil +} + +// MergeOlder prepends value to the result, which involves allocating a new buffer. +func (a *AppendValueMerger) MergeOlder(value []byte) error { + buf := make([]byte, len(a.buf)+len(value)) + copy(buf, value) + copy(buf[len(value):], a.buf) + a.buf = buf + return nil +} + +// Finish returns the buffer that was constructed on-demand in `Merge{OlderNewer}()` calls. +func (a *AppendValueMerger) Finish(includesBase bool) ([]byte, io.Closer, error) { + return a.buf, nil, nil +} + +// DefaultMerger is the default implementation of the Merger interface. It +// concatenates the two values to merge. +var DefaultMerger = &Merger{ + Merge: func(key, value []byte) (ValueMerger, error) { + res := &AppendValueMerger{} + res.buf = append(res.buf, value...) + return res, nil + }, + + Name: "pebble.concatenate", +} diff --git a/pebble/internal/base/metrics.go b/pebble/internal/base/metrics.go new file mode 100644 index 0000000..520edc3 --- /dev/null +++ b/pebble/internal/base/metrics.go @@ -0,0 +1,98 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package base + +import "time" + +// ThroughputMetric is used to measure the byte throughput of some component +// that performs work in a single-threaded manner. The throughput can be +// approximated by Bytes/(WorkDuration+IdleTime). The idle time is represented +// separately, so that the user of this metric could approximate the peak +// throughput as Bytes/WorkTime. The metric is designed to be cumulative (see +// Merge). +type ThroughputMetric struct { + // Bytes is the bytes processed by the component. + Bytes int64 + // WorkDuration is the duration that the component spent doing work. + WorkDuration time.Duration + // IdleDuration is the duration that the component was idling, waiting for + // work. + IdleDuration time.Duration +} + +// Merge accumulates the information from another throughput metric. +func (tm *ThroughputMetric) Merge(x ThroughputMetric) { + tm.Bytes += x.Bytes + tm.WorkDuration += x.WorkDuration + tm.IdleDuration += x.IdleDuration +} + +// Subtract subtracts the information from another ThroughputMetric. +func (tm *ThroughputMetric) Subtract(x ThroughputMetric) { + tm.Bytes -= x.Bytes + tm.WorkDuration -= x.WorkDuration + tm.IdleDuration -= x.IdleDuration +} + +// PeakRate returns the approximate peak rate if there was no idling. +func (tm *ThroughputMetric) PeakRate() int64 { + if tm.Bytes == 0 { + return 0 + } + return int64((float64(tm.Bytes) / float64(tm.WorkDuration)) * float64(time.Second)) +} + +// Rate returns the observed rate. +func (tm *ThroughputMetric) Rate() int64 { + if tm.Bytes == 0 { + return 0 + } + return int64((float64(tm.Bytes) / float64(tm.WorkDuration+tm.IdleDuration)) * + float64(time.Second)) +} + +// Utilization returns a fraction [0, 1.0] indicating the percent of time +// work was performed. 
+func (tm *ThroughputMetric) Utilization() float64 { + if tm.WorkDuration == 0 { + return 0 + } + return float64(tm.WorkDuration) / float64(tm.WorkDuration+tm.IdleDuration) +} + +// GaugeSampleMetric is used to measure a gauge value (e.g. queue length) by +// accumulating samples of that gauge. +type GaugeSampleMetric struct { + // The sum of all the samples. + sampleSum int64 + // The number of samples. + count int64 +} + +// AddSample adds the given sample. +func (gsm *GaugeSampleMetric) AddSample(sample int64) { + gsm.sampleSum += sample + gsm.count++ +} + +// Merge accumulates the information from another gauge metric. +func (gsm *GaugeSampleMetric) Merge(x GaugeSampleMetric) { + gsm.sampleSum += x.sampleSum + gsm.count += x.count +} + +// Subtract subtracts the information from another gauge metric. +func (gsm *GaugeSampleMetric) Subtract(x GaugeSampleMetric) { + gsm.sampleSum -= x.sampleSum + gsm.count -= x.count +} + +// Mean returns the mean value. +func (gsm *GaugeSampleMetric) Mean() float64 { + if gsm.count == 0 { + return 0 + } + return float64(gsm.sampleSum) / float64(gsm.count) +} diff --git a/pebble/internal/base/metrics_test.go b/pebble/internal/base/metrics_test.go new file mode 100644 index 0000000..90e3166 --- /dev/null +++ b/pebble/internal/base/metrics_test.go @@ -0,0 +1,79 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package base + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestThroughputMetric(t *testing.T) { + m1 := ThroughputMetric{ + Bytes: 10, + WorkDuration: time.Millisecond, + IdleDuration: 9 * time.Millisecond, + } + var m2 ThroughputMetric + m2.Merge(m1) + require.Equal(t, m1, m2) + m2.Merge(m1) + doubleM1 := ThroughputMetric{ + Bytes: 2 * m1.Bytes, + WorkDuration: 2 * m1.WorkDuration, + IdleDuration: 2 * m1.IdleDuration, + } + require.Equal(t, doubleM1, m2) + require.EqualValues(t, 10*100, m1.Rate()) + require.EqualValues(t, 10*1000, m1.PeakRate()) +} + +func TestThroughputMetric_Subtract(t *testing.T) { + m1 := ThroughputMetric{ + Bytes: 10, + WorkDuration: time.Millisecond, + IdleDuration: 9 * time.Millisecond, + } + m2 := ThroughputMetric{ + Bytes: 100, + WorkDuration: time.Millisecond, + IdleDuration: 90 * time.Millisecond, + } + + m2.Subtract(m1) + require.Equal(t, int64(90), m2.Bytes) + require.Equal(t, 0*time.Millisecond, m2.WorkDuration) + require.Equal(t, 81*time.Millisecond, m2.IdleDuration) +} + +func TestGaugeSampleMetric(t *testing.T) { + g1 := GaugeSampleMetric{} + g1.AddSample(10) + g1.AddSample(20) + g2 := GaugeSampleMetric{} + g2.Merge(g1) + g2.AddSample(60) + require.EqualValues(t, 30, g2.Mean()) + require.EqualValues(t, 3, g2.count) + require.EqualValues(t, 15, g1.Mean()) + require.EqualValues(t, 2, g1.count) +} + +func TestGaugeSampleMetricSubtract(t *testing.T) { + g1 := GaugeSampleMetric{} + g2 := GaugeSampleMetric{} + g1.AddSample(10) + g1.AddSample(20) + g1.AddSample(0) + + g2.AddSample(10) + + g1.Subtract(g2) + + require.Equal(t, int64(20), g1.sampleSum) + require.Equal(t, int64(2), g1.count) + +} diff --git a/pebble/internal/base/options.go b/pebble/internal/base/options.go new file mode 100644 index 0000000..316717e --- /dev/null +++ b/pebble/internal/base/options.go @@ -0,0 +1,76 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +// SSTable block defaults. +const ( + DefaultBlockRestartInterval = 16 + DefaultBlockSize = 4096 + DefaultBlockSizeThreshold = 90 +) + +// FilterType is the level at which to apply a filter: block or table. +type FilterType int + +// The available filter types. +const ( + TableFilter FilterType = iota +) + +func (t FilterType) String() string { + switch t { + case TableFilter: + return "table" + } + return "unknown" +} + +// FilterWriter provides an interface for creating filter blocks. See +// FilterPolicy for more details about filters. +type FilterWriter interface { + // AddKey adds a key to the current filter block. + AddKey(key []byte) + + // Finish appends to dst an encoded filter that holds the current set of + // keys. The writer state is reset after the call to Finish allowing the + // writer to be reused for the creation of additional filters. + Finish(dst []byte) []byte +} + +// FilterPolicy is an algorithm for probabilistically encoding a set of keys. +// The canonical implementation is a Bloom filter. +// +// Every FilterPolicy has a name. This names the algorithm itself, not any one +// particular instance. Aspects specific to a particular instance, such as the +// set of keys or any other parameters, will be encoded in the []byte filter +// returned by NewWriter. +// +// The name may be written to files on disk, along with the filter data. To use +// these filters, the FilterPolicy name at the time of writing must equal the +// name at the time of reading. If they do not match, the filters will be +// ignored, which will not affect correctness but may affect performance. +type FilterPolicy interface { + // Name names the filter policy. + Name() string + + // MayContain returns whether the encoded filter may contain given key. + // False positives are possible, where it returns true for keys not in the + // original set. 
+ MayContain(ftype FilterType, filter, key []byte) bool + + // NewWriter creates a new FilterWriter. + NewWriter(ftype FilterType) FilterWriter +} + +// BlockPropertyFilter is used in an Iterator to filter sstables and blocks +// within the sstable. It should not maintain any per-sstable state, and must +// be thread-safe. +type BlockPropertyFilter interface { + // Name returns the name of the block property collector. + Name() string + // Intersects returns true if the set represented by prop intersects with + // the set in the filter. + Intersects(prop []byte) (bool, error) +} diff --git a/pebble/internal/batchskl/README.md b/pebble/internal/batchskl/README.md new file mode 100644 index 0000000..1e0aa2d --- /dev/null +++ b/pebble/internal/batchskl/README.md @@ -0,0 +1,56 @@ +# batchskl + +Fast, non-concurrent skiplist implementation in Go that supports +forward and backward iteration. + +## Limitations + +* The interface is tailored for use in indexing pebble batches. Keys + and values are stored outside of the skiplist making the skiplist + awkward for general purpose use. +* Deletion is not supported. Instead, higher-level code is expected to + add deletion tombstones and needs to process those tombstones + appropriately. + +## Pedigree + +This code is based on Andy Kimball's arenaskl code. + +The arenaskl code is based on the skiplist found in Badger, a Go-based +KV store: + +https://github.com/dgraph-io/badger/tree/master/skl + +The skiplist in Badger is itself based on a C++ skiplist built for +Facebook's RocksDB: + +https://github.com/facebook/rocksdb/tree/master/memtable + +## Benchmarks + +The benchmarks consist of a mix of reads and writes executed in parallel. The +fraction of reads is indicated in the run name: "frac_X" indicates a run where +X percent of the operations are reads. 
+ +``` +name time/op +ReadWrite/frac_0 1.03µs ± 2% +ReadWrite/frac_10 1.32µs ± 1% +ReadWrite/frac_20 1.26µs ± 1% +ReadWrite/frac_30 1.18µs ± 1% +ReadWrite/frac_40 1.09µs ± 1% +ReadWrite/frac_50 987ns ± 2% +ReadWrite/frac_60 1.07µs ± 1% +ReadWrite/frac_70 909ns ± 1% +ReadWrite/frac_80 693ns ± 2% +ReadWrite/frac_90 599ns ± 2% +ReadWrite/frac_100 45.3ns ± 3% +``` + +Forward and backward iteration are also fast: + +``` +name time/op +IterNext 4.49ns ± 3% +IterPrev 4.48ns ± 3% +``` diff --git a/pebble/internal/batchskl/iterator.go b/pebble/internal/batchskl/iterator.go new file mode 100644 index 0000000..5917ed1 --- /dev/null +++ b/pebble/internal/batchskl/iterator.go @@ -0,0 +1,223 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package batchskl + +import "github.com/cockroachdb/pebble/internal/base" + +type splice struct { + prev uint32 + next uint32 +} + +// Iterator is an iterator over the skiplist object. Use Skiplist.NewIter +// to construct an iterator. The current state of the iterator can be cloned +// by simply value copying the struct. +type Iterator struct { + list *Skiplist + nd uint32 + key base.InternalKey + lower []byte + upper []byte +} + +// Close resets the iterator. 
+func (it *Iterator) Close() error { + it.list = nil + it.nd = 0 + return nil +} + +// SeekGE moves the iterator to the first entry whose key is greater than or +// equal to the given key. Returns true if the iterator is pointing at a valid +// entry and false otherwise. Note that SeekGE only checks the upper bound. It +// is up to the caller to ensure that key is greater than or equal to the lower +// bound. +func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKey { + if flags.TrySeekUsingNext() { + if it.nd == it.list.tail { + // Iterator is done. + return nil + } + less := it.list.cmp(it.key.UserKey, key) < 0 + // Arbitrary constant. By measuring the seek cost as a function of the + // number of elements in the skip list, and fitting to a model, we + // could adjust the number of nexts based on the current size of the + // skip list. + const numNexts = 5 + for i := 0; less && i < numNexts; i++ { + k := it.Next() + if k == nil { + // Iterator is done. + return nil + } + less = it.list.cmp(k.UserKey, key) < 0 + } + if !less { + return &it.key + } + } + + _, it.nd = it.seekForBaseSplice(key, it.list.abbreviatedKey(key)) + if it.nd == it.list.tail { + return nil + } + nodeKey := it.list.getKey(it.nd) + if it.upper != nil && it.list.cmp(it.upper, nodeKey.UserKey) <= 0 { + it.nd = it.list.tail + return nil + } + it.key = nodeKey + return &it.key +} + +// SeekLT moves the iterator to the last entry whose key is less the given +// key. Returns true if the iterator is pointing at a valid entry and false +// otherwise. Note that SeekLT only checks the lower bound. It is up to the +// caller to ensure that key is less than the upper bound. 
+func (it *Iterator) SeekLT(key []byte) *base.InternalKey { + it.nd, _ = it.seekForBaseSplice(key, it.list.abbreviatedKey(key)) + if it.nd == it.list.head { + return nil + } + nodeKey := it.list.getKey(it.nd) + if it.lower != nil && it.list.cmp(it.lower, nodeKey.UserKey) > 0 { + it.nd = it.list.head + return nil + } + it.key = nodeKey + return &it.key +} + +// First seeks position at the first entry in list. Final state of iterator is +// Valid() iff list is not empty. Note that First only checks the upper +// bound. It is up to the caller to ensure that key is greater than or equal to +// the lower bound (e.g. via a call to SeekGE(lower)). +func (it *Iterator) First() *base.InternalKey { + it.nd = it.list.getNext(it.list.head, 0) + if it.nd == it.list.tail { + return nil + } + nodeKey := it.list.getKey(it.nd) + if it.upper != nil && it.list.cmp(it.upper, nodeKey.UserKey) <= 0 { + it.nd = it.list.tail + return nil + } + it.key = nodeKey + return &it.key +} + +// Last seeks position at the last entry in list. Final state of iterator is +// Valid() iff list is not empty. Note that Last only checks the lower +// bound. It is up to the caller to ensure that key is less than the upper +// bound (e.g. via a call to SeekLT(upper)). +func (it *Iterator) Last() *base.InternalKey { + it.nd = it.list.getPrev(it.list.tail, 0) + if it.nd == it.list.head { + return nil + } + nodeKey := it.list.getKey(it.nd) + if it.lower != nil && it.list.cmp(it.lower, nodeKey.UserKey) > 0 { + it.nd = it.list.head + return nil + } + it.key = nodeKey + return &it.key +} + +// Next advances to the next position. If there are no following nodes, then +// Valid() will be false after this call. 
+func (it *Iterator) Next() *base.InternalKey { + it.nd = it.list.getNext(it.nd, 0) + if it.nd == it.list.tail { + return nil + } + nodeKey := it.list.getKey(it.nd) + if it.upper != nil && it.list.cmp(it.upper, nodeKey.UserKey) <= 0 { + it.nd = it.list.tail + return nil + } + it.key = nodeKey + return &it.key +} + +// Prev moves to the previous position. If there are no previous nodes, then +// Valid() will be false after this call. +func (it *Iterator) Prev() *base.InternalKey { + it.nd = it.list.getPrev(it.nd, 0) + if it.nd == it.list.head { + return nil + } + nodeKey := it.list.getKey(it.nd) + if it.lower != nil && it.list.cmp(it.lower, nodeKey.UserKey) > 0 { + it.nd = it.list.head + return nil + } + it.key = nodeKey + return &it.key +} + +// Key returns the key at the current position. +func (it *Iterator) Key() *base.InternalKey { + return &it.key +} + +// KeyInfo returns the offset of the start of the record, the start of the key, +// and the end of the key. +func (it *Iterator) KeyInfo() (offset, keyStart, keyEnd uint32) { + n := it.list.node(it.nd) + return n.offset, n.keyStart, n.keyEnd +} + +// Head true iff the iterator is positioned at the sentinel head node. +func (it *Iterator) Head() bool { + return it.nd == it.list.head +} + +// Tail true iff the iterator is positioned at the sentinel tail node. +func (it *Iterator) Tail() bool { + return it.nd == it.list.tail +} + +// Valid returns nil iff the iterator is positioned at a valid node. +func (it *Iterator) Valid() bool { + return it.list != nil && it.nd != it.list.head && it.nd != it.list.tail +} + +func (it *Iterator) String() string { + return "batch" +} + +// SetBounds sets the lower and upper bounds for the iterator. Note that the +// result of Next and Prev will be undefined until the iterator has been +// repositioned with SeekGE, SeekLT, First, or Last. 
+func (it *Iterator) SetBounds(lower, upper []byte) { + it.lower = lower + it.upper = upper +} + +func (it *Iterator) seekForBaseSplice(key []byte, abbreviatedKey uint64) (prev, next uint32) { + prev = it.list.head + for level := it.list.height - 1; ; level-- { + prev, next = it.list.findSpliceForLevel(key, abbreviatedKey, level, prev) + if level == 0 { + break + } + } + + return +} diff --git a/pebble/internal/batchskl/skl.go b/pebble/internal/batchskl/skl.go new file mode 100644 index 0000000..f56d95c --- /dev/null +++ b/pebble/internal/batchskl/skl.go @@ -0,0 +1,442 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License") + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +Adapted from RocksDB inline skiplist. + +Key differences: +- No optimization for sequential inserts (no "prev"). +- No custom comparator. +- Support overwrites. This requires care when we see the same key when inserting. + For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so + there is no need for values. We don't intend to support versioning. In-place updates of values + would be more efficient. +- We discard all non-concurrent code. +- We do not support Splices. This simplifies the code a lot. +- No AllocateNode or other pointer arithmetic. +- We combine the findLessThan, findGreaterOrEqual, etc into one function. 
+*/ + +/* +Further adapted from Badger: https://github.com/dgraph-io/badger. + +Key differences: +- Support for previous pointers - doubly linked lists. Note that it's up to higher + level code to deal with the intermediate state that occurs during insertion, + where node A is linked to node B, but node B is not yet linked back to node A. +- Iterator includes mutator functions. +*/ + +/* +Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl + +Key differences: +- Removed support for deletion. +- Removed support for concurrency. +- External storage of keys. +- Node storage grows to an arbitrary size. +*/ + +package batchskl // import "github.com/cockroachdb/pebble/internal/batchskl" + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "time" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/constants" + "golang.org/x/exp/rand" +) + +const ( + maxHeight = 20 + maxNodeSize = uint64(unsafe.Sizeof(node{})) + linksSize = uint64(unsafe.Sizeof(links{})) + maxNodesSize = constants.MaxUint32OrInt +) + +var ( + // ErrExists indicates that a duplicate record was inserted. This should never + // happen for normal usage of batchskl as every key should have a unique + // sequence number. + ErrExists = errors.New("record with this key already exists") + + // ErrTooManyRecords is a sentinel error returned when the size of the raw + // nodes slice exceeds the maximum allowed size (currently 1 << 32 - 1). This + // corresponds to ~117 M skiplist entries. + ErrTooManyRecords = errors.New("too many records") +) + +type links struct { + next uint32 + prev uint32 +} + +type node struct { + // The offset of the start of the record in the storage. + offset uint32 + // The offset of the start and end of the key in storage. + keyStart uint32 + keyEnd uint32 + // A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key + // during seek operations. 
The key retrieval can be expensive purely due to + // cache misses while the abbreviatedKey stored here will be in the same + // cache line as the key and the links making accessing and comparing against + // it almost free. + abbreviatedKey uint64 + // Most nodes do not need to use the full height of the link tower, since the + // probability of each successive level decreases exponentially. Because + // these elements are never accessed, they do not need to be allocated. + // Therefore, when a node is allocated, its memory footprint is deliberately + // truncated to not include unneeded link elements. + links [maxHeight]links +} + +// Skiplist is a fast, non-cocnurrent skiplist implementation that supports +// forward and backward iteration. See arenaskl.Skiplist for a concurrent +// skiplist. Keys and values are stored externally from the skiplist via the +// Storage interface. Deletion is not supported. Instead, higher-level code is +// expected to perform deletion via tombstones and needs to process those +// tombstones appropriately during retrieval operations. +type Skiplist struct { + storage *[]byte + cmp base.Compare + abbreviatedKey base.AbbreviatedKey + nodes []byte + head uint32 + tail uint32 + height uint32 // Current height: 1 <= height <= maxHeight + rand rand.PCGSource +} + +var ( + probabilities [maxHeight]uint32 +) + +func init() { + const pValue = 1 / math.E + + // Precompute the skiplist probabilities so that only a single random number + // needs to be generated and so that the optimal pvalue can be used (inverse + // of Euler's number). + p := float64(1.0) + for i := 0; i < maxHeight; i++ { + probabilities[i] = uint32(float64(math.MaxUint32) * p) + p *= pValue + } +} + +// NewSkiplist constructs and initializes a new, empty skiplist. 
+func NewSkiplist(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) *Skiplist { + s := &Skiplist{} + s.Init(storage, cmp, abbreviatedKey) + return s +} + +// Reset the fields in the skiplist for reuse. +func (s *Skiplist) Reset() { + *s = Skiplist{ + nodes: s.nodes[:0], + height: 1, + } + const batchMaxRetainedSize = 1 << 20 // 1 MB + if cap(s.nodes) > batchMaxRetainedSize { + s.nodes = nil + } +} + +// Init the skiplist to empty and re-initialize. +func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) { + *s = Skiplist{ + storage: storage, + cmp: cmp, + abbreviatedKey: abbreviatedKey, + nodes: s.nodes[:0], + height: 1, + } + s.rand.Seed(uint64(time.Now().UnixNano())) + + const initBufSize = 256 + if cap(s.nodes) < initBufSize { + s.nodes = make([]byte, 0, initBufSize) + } + + // Allocate head and tail nodes. While allocating a new node can fail, in the + // context of initializing the skiplist we consider it unrecoverable. + var err error + s.head, err = s.newNode(maxHeight, 0, 0, 0, 0) + if err != nil { + panic(err) + } + s.tail, err = s.newNode(maxHeight, 0, 0, 0, 0) + if err != nil { + panic(err) + } + + // Link all head/tail levels together. + headNode := s.node(s.head) + tailNode := s.node(s.tail) + for i := uint32(0); i < maxHeight; i++ { + headNode.links[i].next = s.tail + tailNode.links[i].prev = s.head + } +} + +// Add adds a new key to the skiplist if it does not yet exist. If the record +// already exists, then Add returns ErrRecordExists. 
+func (s *Skiplist) Add(keyOffset uint32) error { + data := (*s.storage)[keyOffset+1:] + v, n := binary.Uvarint(data) + if n <= 0 { + return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset)) + } + data = data[n:] + if v > uint64(len(data)) { + return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset)) + } + keyStart := 1 + keyOffset + uint32(n) + keyEnd := keyStart + uint32(v) + key := data[:v] + abbreviatedKey := s.abbreviatedKey(key) + + // spl holds the list of next and previous links for each level in the + // skiplist indicating where the new node will be inserted. + var spl [maxHeight]splice + + // Fast-path for in-order insertion of keys: compare the new key against the + // last key. + prev := s.getPrev(s.tail, 0) + if prevNode := s.node(prev); prev == s.head || + abbreviatedKey > prevNode.abbreviatedKey || + (abbreviatedKey == prevNode.abbreviatedKey && + s.cmp(key, (*s.storage)[prevNode.keyStart:prevNode.keyEnd]) > 0) { + for level := uint32(0); level < s.height; level++ { + spl[level].prev = s.getPrev(s.tail, level) + spl[level].next = s.tail + } + } else { + s.findSplice(key, abbreviatedKey, &spl) + } + + height := s.randomHeight() + // Increase s.height as necessary. + for ; s.height < height; s.height++ { + spl[s.height].next = s.tail + spl[s.height].prev = s.head + } + + // We always insert from the base level and up. After you add a node in base + // level, we cannot create a node in the level above because it would have + // discovered the node in the base level. + nd, err := s.newNode(height, keyOffset, keyStart, keyEnd, abbreviatedKey) + if err != nil { + return err + } + newNode := s.node(nd) + for level := uint32(0); level < height; level++ { + next := spl[level].next + prev := spl[level].prev + newNode.links[level].next = next + newNode.links[level].prev = prev + s.node(next).links[level].prev = nd + s.node(prev).links[level].next = nd + } + + return nil +} + +// NewIter returns a new Iterator object. 
The lower and upper bound parameters +// control the range of keys the iterator will return. Specifying for nil for +// lower or upper bound disables the check for that boundary. Note that lower +// bound is not checked on {SeekGE,First} and upper bound is not check on +// {SeekLT,Last}. The user is expected to perform that check. Note that it is +// safe for an iterator to be copied by value. +func (s *Skiplist) NewIter(lower, upper []byte) Iterator { + return Iterator{list: s, lower: lower, upper: upper} +} + +func (s *Skiplist) newNode( + height, + offset, keyStart, keyEnd uint32, abbreviatedKey uint64, +) (uint32, error) { + if height < 1 || height > maxHeight { + panic("height cannot be less than one or greater than the max height") + } + + unusedSize := uint64(maxHeight-int(height)) * linksSize + nodeOffset, err := s.alloc(uint32(maxNodeSize - unusedSize)) + if err != nil { + return 0, err + } + nd := s.node(nodeOffset) + + nd.offset = offset + nd.keyStart = keyStart + nd.keyEnd = keyEnd + nd.abbreviatedKey = abbreviatedKey + return nodeOffset, nil +} + +func (s *Skiplist) alloc(size uint32) (uint32, error) { + offset := uint64(len(s.nodes)) + + // We only have a need for memory up to offset + size, but we never want + // to allocate a node whose tail points into unallocated memory. + minAllocSize := offset + maxNodeSize + if uint64(cap(s.nodes)) < minAllocSize { + allocSize := uint64(cap(s.nodes)) * 2 + if allocSize < minAllocSize { + allocSize = minAllocSize + } + // Cap the allocation at the max allowed size to avoid wasted capacity. + if allocSize > maxNodesSize { + // The new record may still not fit within the allocation, in which case + // we return early with an error. This avoids the panic below when we + // resize the slice. It also avoids the allocation and copy. 
+ if uint64(offset)+uint64(size) > maxNodesSize { + return 0, errors.Wrapf(ErrTooManyRecords, + "alloc of new record (size=%d) would overflow uint32 (current size=%d)", + uint64(offset)+uint64(size), offset, + ) + } + allocSize = maxNodesSize + } + tmp := make([]byte, len(s.nodes), allocSize) + copy(tmp, s.nodes) + s.nodes = tmp + } + + newSize := uint32(offset) + size + s.nodes = s.nodes[:newSize] + return uint32(offset), nil +} + +func (s *Skiplist) node(offset uint32) *node { + return (*node)(unsafe.Pointer(&s.nodes[offset])) +} + +func (s *Skiplist) randomHeight() uint32 { + rnd := uint32(s.rand.Uint64()) + h := uint32(1) + for h < maxHeight && rnd <= probabilities[h] { + h++ + } + return h +} + +func (s *Skiplist) findSplice(key []byte, abbreviatedKey uint64, spl *[maxHeight]splice) { + prev := s.head + + for level := s.height - 1; ; level-- { + // The code in this loop is the same as findSpliceForLevel(). For some + // reason, calling findSpliceForLevel() here is much much slower than the + // inlined code below. The excess time is also caught up in the final + // return statement which makes little sense. Revisit when in go1.14 or + // later if inlining improves. + + next := s.getNext(prev, level) + for next != s.tail { + // Assume prev.key < key. + nextNode := s.node(next) + nextAbbreviatedKey := nextNode.abbreviatedKey + if abbreviatedKey < nextAbbreviatedKey { + // We are done for this level, since prev.key < key < next.key. + break + } + if abbreviatedKey == nextAbbreviatedKey { + if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 { + // We are done for this level, since prev.key < key <= next.key. + break + } + } + + // Keep moving right on this level. 
+ prev = next + next = nextNode.links[level].next + } + + spl[level].prev = prev + spl[level].next = next + if level == 0 { + break + } + } +} + +func (s *Skiplist) findSpliceForLevel( + key []byte, abbreviatedKey uint64, level, start uint32, +) (prev, next uint32) { + prev = start + next = s.getNext(prev, level) + + for next != s.tail { + // Assume prev.key < key. + nextNode := s.node(next) + nextAbbreviatedKey := nextNode.abbreviatedKey + if abbreviatedKey < nextAbbreviatedKey { + // We are done for this level, since prev.key < key < next.key. + break + } + if abbreviatedKey == nextAbbreviatedKey { + if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 { + // We are done for this level, since prev.key < key < next.key. + break + } + } + + // Keep moving right on this level. + prev = next + next = nextNode.links[level].next + } + + return +} + +func (s *Skiplist) getKey(nd uint32) base.InternalKey { + n := s.node(nd) + kind := base.InternalKeyKind((*s.storage)[n.offset]) + key := (*s.storage)[n.keyStart:n.keyEnd] + return base.MakeInternalKey(key, uint64(n.offset)|base.InternalKeySeqNumBatch, kind) +} + +func (s *Skiplist) getNext(nd, h uint32) uint32 { + return s.node(nd).links[h].next +} + +func (s *Skiplist) getPrev(nd, h uint32) uint32 { + return s.node(nd).links[h].prev +} + +func (s *Skiplist) debug() string { + var buf bytes.Buffer + for level := uint32(0); level < s.height; level++ { + var count int + for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) { + count++ + } + fmt.Fprintf(&buf, "%d: %d\n", level, count) + } + return buf.String() +} + +// Silence unused warning. +var _ = (*Skiplist).debug diff --git a/pebble/internal/batchskl/skl_test.go b/pebble/internal/batchskl/skl_test.go new file mode 100644 index 0000000..4f67a8b --- /dev/null +++ b/pebble/internal/batchskl/skl_test.go @@ -0,0 +1,539 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. 
and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package batchskl + +import ( + "bytes" + "encoding/binary" + "fmt" + "testing" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/stretchr/testify/require" + "golang.org/x/exp/rand" +) + +// iterAdapter adapts the new Iterator API which returns the key and value from +// positioning methods (Seek*, First, Last, Next, Prev) to the old API which +// returned a boolean corresponding to Valid. Only used by test code. 
+type iterAdapter struct { + Iterator +} + +func (i *iterAdapter) verify(key *base.InternalKey) bool { + valid := key != nil + if valid != i.Valid() { + panic(fmt.Sprintf("inconsistent valid: %t != %t", valid, i.Valid())) + } + if valid { + if base.InternalCompare(bytes.Compare, *key, i.Key()) != 0 { + panic(fmt.Sprintf("inconsistent key: %s != %s", *key, i.Key())) + } + } + return valid +} + +func (i *iterAdapter) SeekGE(key []byte) bool { + return i.verify(i.Iterator.SeekGE(key, base.SeekGEFlagsNone)) +} + +func (i *iterAdapter) SeekLT(key []byte) bool { + return i.verify(i.Iterator.SeekLT(key)) +} + +func (i *iterAdapter) First() bool { + return i.verify(i.Iterator.First()) +} + +func (i *iterAdapter) Last() bool { + return i.verify(i.Iterator.Last()) +} + +func (i *iterAdapter) Next() bool { + return i.verify(i.Iterator.Next()) +} + +func (i *iterAdapter) Prev() bool { + return i.verify(i.Iterator.Prev()) +} + +func (i *iterAdapter) Key() base.InternalKey { + return *i.Iterator.Key() +} + +// length iterates over skiplist to give exact size. +func length(s *Skiplist) int { + count := 0 + + it := iterAdapter{s.NewIter(nil, nil)} + for valid := it.First(); valid; valid = it.Next() { + count++ + } + + return count +} + +// length iterates over skiplist in reverse order to give exact size. +func lengthRev(s *Skiplist) int { + count := 0 + + it := iterAdapter{s.NewIter(nil, nil)} + for valid := it.Last(); valid; valid = it.Prev() { + count++ + } + + return count +} + +func makeKey(s string) []byte { + return []byte(s) +} + +type testStorage struct { + data []byte +} + +func (d *testStorage) add(key string) uint32 { + offset := uint32(len(d.data)) + d.data = append(d.data, uint8(base.InternalKeyKindSet)) + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], uint64(len(key))) + d.data = append(d.data, buf[:n]...) + d.data = append(d.data, key...) 
+ return offset +} + +func (d *testStorage) addBytes(key []byte) uint32 { + offset := uint32(len(d.data)) + d.data = append(d.data, uint8(base.InternalKeyKindSet)) + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], uint64(len(key))) + d.data = append(d.data, buf[:n]...) + d.data = append(d.data, key...) + return offset +} + +func newTestSkiplist(storage *testStorage) *Skiplist { + return NewSkiplist(&storage.data, base.DefaultComparer.Compare, + base.DefaultComparer.AbbreviatedKey) +} + +func TestEmpty(t *testing.T) { + key := makeKey("aaa") + l := newTestSkiplist(&testStorage{}) + it := iterAdapter{l.NewIter(nil, nil)} + + require.False(t, it.Valid()) + + it.First() + require.False(t, it.Valid()) + + it.Last() + require.False(t, it.Valid()) + + require.False(t, it.SeekGE(key)) + require.False(t, it.Valid()) +} + +// TestBasic tests seeks and adds. +func TestBasic(t *testing.T) { + d := &testStorage{} + l := newTestSkiplist(d) + it := iterAdapter{l.NewIter(nil, nil)} + + // Try adding values. + require.Nil(t, l.Add(d.add("key1"))) + require.Nil(t, l.Add(d.add("key2"))) + require.Nil(t, l.Add(d.add("key3"))) + + require.True(t, it.SeekGE(makeKey("key"))) + require.EqualValues(t, "key1", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey("key1"))) + require.EqualValues(t, "key1", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey("key2"))) + require.EqualValues(t, "key2", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey("key3"))) + require.EqualValues(t, "key3", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey("key2"))) + require.True(t, it.SeekGE(makeKey("key3"))) +} + +func TestSkiplistAdd(t *testing.T) { + d := &testStorage{} + l := newTestSkiplist(d) + it := iterAdapter{l.NewIter(nil, nil)} + + // Add empty key. + require.Nil(t, l.Add(d.add(""))) + require.EqualValues(t, []byte(nil), it.Key().UserKey) + require.True(t, it.First()) + require.EqualValues(t, []byte{}, it.Key().UserKey) + + // Add to empty list. 
+ require.Nil(t, l.Add(d.add("00002"))) + require.True(t, it.SeekGE(makeKey("00002"))) + require.EqualValues(t, "00002", it.Key().UserKey) + + // Add first element in non-empty list. + require.Nil(t, l.Add(d.add("00001"))) + require.True(t, it.SeekGE(makeKey("00001"))) + require.EqualValues(t, "00001", it.Key().UserKey) + + // Add last element in non-empty list. + require.Nil(t, l.Add(d.add("00004"))) + require.True(t, it.SeekGE(makeKey("00004"))) + require.EqualValues(t, "00004", it.Key().UserKey) + + // Add element in middle of list. + require.Nil(t, l.Add(d.add("00003"))) + require.True(t, it.SeekGE(makeKey("00003"))) + require.EqualValues(t, "00003", it.Key().UserKey) + + // Try to add element that already exists. + require.Nil(t, l.Add(d.add("00002"))) + require.Equal(t, 6, length(l)) + require.Equal(t, 6, lengthRev(l)) +} + +func TestSkiplistAdd_Overflow(t *testing.T) { + // Regression test for cockroachdb/pebble#1258. The length of the nodes buffer + // cannot exceed the maximum allowable size. + d := &testStorage{} + l := newTestSkiplist(d) + + // Simulate a full nodes slice. This speeds up the test significantly, as + // opposed to adding data to the list. + l.nodes = make([]byte, maxNodesSize) + + // Adding a new node to the list would overflow the nodes slice. Note that it + // is the size of a new node struct that is relevant here, rather than the + // size of the data being added to the list. + err := l.Add(d.add("too much!")) + require.Error(t, err) + require.True(t, errors.Is(err, ErrTooManyRecords)) +} + +// TestIteratorNext tests a basic iteration over all nodes from the beginning. 
+func TestIteratorNext(t *testing.T) { + const n = 100 + d := &testStorage{} + l := newTestSkiplist(d) + it := iterAdapter{l.NewIter(nil, nil)} + + require.False(t, it.Valid()) + + it.First() + require.False(t, it.Valid()) + + for i := n - 1; i >= 0; i-- { + require.Nil(t, l.Add(d.add(fmt.Sprintf("%05d", i)))) + } + + it.First() + for i := 0; i < n; i++ { + require.True(t, it.Valid()) + require.EqualValues(t, fmt.Sprintf("%05d", i), it.Key().UserKey) + it.Next() + } + require.False(t, it.Valid()) +} + +// // TestIteratorPrev tests a basic iteration over all nodes from the end. +func TestIteratorPrev(t *testing.T) { + const n = 100 + d := &testStorage{} + l := newTestSkiplist(d) + it := iterAdapter{l.NewIter(nil, nil)} + + require.False(t, it.Valid()) + + it.Last() + require.False(t, it.Valid()) + + for i := 0; i < n; i++ { + l.Add(d.add(fmt.Sprintf("%05d", i))) + } + + it.Last() + for i := n - 1; i >= 0; i-- { + require.True(t, it.Valid()) + require.EqualValues(t, fmt.Sprintf("%05d", i), string(it.Key().UserKey)) + it.Prev() + } + require.False(t, it.Valid()) +} + +func TestIteratorSeekGE(t *testing.T) { + const n = 1000 + d := &testStorage{} + l := newTestSkiplist(d) + it := iterAdapter{l.NewIter(nil, nil)} + + require.False(t, it.Valid()) + it.First() + require.False(t, it.Valid()) + // 1000, 1010, 1020, ..., 1990. 
+ for i := n - 1; i >= 0; i-- { + require.Nil(t, l.Add(d.add(fmt.Sprintf("%05d", i*10+1000)))) + } + + require.True(t, it.SeekGE(makeKey(""))) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey("01000"))) + require.True(t, it.Valid()) + require.EqualValues(t, "01000", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey("01005"))) + require.True(t, it.Valid()) + require.EqualValues(t, "01010", it.Key().UserKey) + + require.True(t, it.SeekGE(makeKey("01010"))) + require.True(t, it.Valid()) + require.EqualValues(t, "01010", it.Key().UserKey) + + require.False(t, it.SeekGE(makeKey("99999"))) + require.False(t, it.Valid()) + + // Test seek for empty key. + require.Nil(t, l.Add(d.add(""))) + require.True(t, it.SeekGE([]byte{})) + require.True(t, it.Valid()) + + require.True(t, it.SeekGE(makeKey(""))) + require.True(t, it.Valid()) +} + +func TestIteratorSeekLT(t *testing.T) { + const n = 100 + d := &testStorage{} + l := newTestSkiplist(d) + it := iterAdapter{l.NewIter(nil, nil)} + + require.False(t, it.Valid()) + it.First() + require.False(t, it.Valid()) + // 1000, 1010, 1020, ..., 1990. + for i := n - 1; i >= 0; i-- { + require.Nil(t, l.Add(d.add(fmt.Sprintf("%05d", i*10+1000)))) + } + + require.False(t, it.SeekLT(makeKey(""))) + require.False(t, it.Valid()) + + require.False(t, it.SeekLT(makeKey("01000"))) + require.False(t, it.Valid()) + + require.True(t, it.SeekLT(makeKey("01001"))) + require.EqualValues(t, "01000", it.Key().UserKey) + require.True(t, it.Valid()) + + require.True(t, it.SeekLT(makeKey("01005"))) + require.EqualValues(t, "01000", it.Key().UserKey) + require.True(t, it.Valid()) + + require.True(t, it.SeekLT(makeKey("01991"))) + require.EqualValues(t, "01990", it.Key().UserKey) + require.True(t, it.Valid()) + + require.True(t, it.SeekLT(makeKey("99999"))) + require.True(t, it.Valid()) + require.EqualValues(t, "01990", it.Key().UserKey) + + // Test seek for empty key. 
+ require.Nil(t, l.Add(d.add(""))) + require.False(t, it.SeekLT([]byte{})) + require.False(t, it.Valid()) + require.True(t, it.SeekLT(makeKey("\x01"))) + require.True(t, it.Valid()) + require.EqualValues(t, "", it.Key().UserKey) +} + +// TODO(peter): test First and Last. +func TestIteratorBounds(t *testing.T) { + d := &testStorage{} + l := newTestSkiplist(d) + for i := 1; i < 10; i++ { + require.NoError(t, l.Add(d.add(fmt.Sprintf("%05d", i)))) + } + + it := iterAdapter{l.NewIter(makeKey("00003"), makeKey("00007"))} + + // SeekGE within the lower and upper bound succeeds. + for i := 3; i <= 6; i++ { + key := makeKey(fmt.Sprintf("%05d", i)) + require.True(t, it.SeekGE(key)) + require.EqualValues(t, string(key), string(it.Key().UserKey)) + } + + // SeekGE before the lower bound still succeeds (only the upper bound is + // checked). + for i := 1; i < 3; i++ { + key := makeKey(fmt.Sprintf("%05d", i)) + require.True(t, it.SeekGE(key)) + require.EqualValues(t, string(key), string(it.Key().UserKey)) + } + + // SeekGE beyond the upper bound fails. + for i := 7; i < 10; i++ { + key := makeKey(fmt.Sprintf("%05d", i)) + require.False(t, it.SeekGE(key)) + } + + require.True(t, it.SeekGE(makeKey("00006"))) + require.EqualValues(t, "00006", it.Key().UserKey) + + // Next into the upper bound fails. + require.False(t, it.Next()) + + // SeekLT within the lower and upper bound succeeds. + for i := 4; i <= 7; i++ { + key := makeKey(fmt.Sprintf("%05d", i)) + require.True(t, it.SeekLT(key)) + require.EqualValues(t, fmt.Sprintf("%05d", i-1), string(it.Key().UserKey)) + } + + // SeekLT beyond the upper bound still succeeds (only the lower bound is + // checked). + for i := 8; i < 9; i++ { + key := makeKey(fmt.Sprintf("%05d", i)) + require.True(t, it.SeekLT(key)) + require.EqualValues(t, fmt.Sprintf("%05d", i-1), string(it.Key().UserKey)) + } + + // SeekLT before the lower bound fails. 
+ for i := 1; i < 4; i++ { + key := makeKey(fmt.Sprintf("%05d", i)) + require.False(t, it.SeekLT(key)) + } + + require.True(t, it.SeekLT(makeKey("00004"))) + require.EqualValues(t, "00003", it.Key().UserKey) + + // Prev into the lower bound fails. + require.False(t, it.Prev()) +} + +func randomKey(rng *rand.Rand, b []byte) []byte { + key := rng.Uint32() + key2 := rng.Uint32() + binary.LittleEndian.PutUint32(b, key) + binary.LittleEndian.PutUint32(b[4:], key2) + return b +} + +// Standard test. Some fraction is read. Some fraction is write. Writes have +// to go through mutex lock. +func BenchmarkReadWrite(b *testing.B) { + for i := 0; i <= 10; i++ { + readFrac := float32(i) / 10.0 + b.Run(fmt.Sprintf("frac_%d", i*10), func(b *testing.B) { + var buf [8]byte + d := &testStorage{ + data: make([]byte, 0, b.N*10), + } + l := newTestSkiplist(d) + it := l.NewIter(nil, nil) + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + key := randomKey(rng, buf[:]) + if rng.Float32() < readFrac { + _ = it.SeekGE(key, base.SeekGEFlagsNone) + } else { + offset := d.addBytes(buf[:]) + _ = l.Add(offset) + } + } + b.StopTimer() + }) + } +} + +func BenchmarkOrderedWrite(b *testing.B) { + var buf [8]byte + d := &testStorage{ + data: make([]byte, 0, b.N*10), + } + l := newTestSkiplist(d) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + binary.BigEndian.PutUint64(buf[:], uint64(i)) + offset := d.addBytes(buf[:]) + _ = l.Add(offset) + } +} + +func BenchmarkIterNext(b *testing.B) { + var buf [8]byte + d := &testStorage{ + data: make([]byte, 0, 64<<10), + } + l := newTestSkiplist(d) + + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + for len(d.data)+20 < cap(d.data) { + key := randomKey(rng, buf[:]) + offset := d.addBytes(key) + err := l.Add(offset) + require.NoError(b, err) + } + + it := l.NewIter(nil, nil) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.Valid() { + it.First() + } + it.Next() + } +} + +func 
BenchmarkIterPrev(b *testing.B) { + var buf [8]byte + d := &testStorage{ + data: make([]byte, 0, 64<<10), + } + l := newTestSkiplist(d) + + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + for len(d.data)+20 < cap(d.data) { + key := randomKey(rng, buf[:]) + offset := d.addBytes(key) + err := l.Add(offset) + require.NoError(b, err) + } + + it := l.NewIter(nil, nil) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.Valid() { + it.Last() + } + it.Prev() + } +} diff --git a/pebble/internal/bytealloc/bytealloc.go b/pebble/internal/bytealloc/bytealloc.go new file mode 100644 index 0000000..b905270 --- /dev/null +++ b/pebble/internal/bytealloc/bytealloc.go @@ -0,0 +1,69 @@ +// Copyright 2016 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package bytealloc + +import "github.com/cockroachdb/pebble/internal/rawalloc" + +// An A provides chunk allocation of []byte, amortizing the overhead of each +// allocation. Because the underlying storage for the slices is shared, they +// should share a similar lifetime in order to avoid pinning large amounts of +// memory unnecessarily. The allocator itself is a []byte where cap() indicates +// the total amount of memory and len() is the amount already allocated. The +// size of the buffer to allocate from is grown exponentially when it runs out +// of room up to a maximum size (chunkAllocMaxSize). 
+type A []byte
+
+const chunkAllocMinSize = 512
+const chunkAllocMaxSize = 512 << 10 // 512 KB
+
+func (a A) reserve(n int) A {
+	allocSize := cap(a) * 2
+	if allocSize < chunkAllocMinSize {
+		allocSize = chunkAllocMinSize
+	} else if allocSize > chunkAllocMaxSize {
+		allocSize = chunkAllocMaxSize
+	}
+	if allocSize < n {
+		allocSize = n
+	}
+	return rawalloc.New(0, allocSize)
+}
+
+// Alloc allocates a new chunk of memory with the specified length.
+func (a A) Alloc(n int) (A, []byte) {
+	if cap(a)-len(a) < n {
+		a = a.reserve(n)
+	}
+	p := len(a)
+	r := a[p : p+n : p+n]
+	a = a[:p+n]
+	return a, r
+}
+
+// Copy allocates a new chunk of memory, initializing it from src.
+func (a A) Copy(src []byte) (A, []byte) {
+	var alloc []byte
+	a, alloc = a.Alloc(len(src))
+	copy(alloc, src)
+	return a, alloc
+}
+
+// Reset returns the current chunk, resetting allocated memory back to none.
+// Future allocations will use memory previously allocated by previous calls to
+// Alloc or Copy, so the caller must know that none of the previously
+// allocated byte slices are still in use.
+func (a A) Reset() A {
+	return a[:0]
+}
diff --git a/pebble/internal/cache/LICENSE b/pebble/internal/cache/LICENSE
new file mode 100644
index 0000000..daa739e
--- /dev/null
+++ b/pebble/internal/cache/LICENSE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2018 Damian Gryski
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/pebble/internal/cache/cgo_disabled.go b/pebble/internal/cache/cgo_disabled.go new file mode 100644 index 0000000..0e75574 --- /dev/null +++ b/pebble/internal/cache/cgo_disabled.go @@ -0,0 +1,10 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !cgo +// +build !cgo + +package cache + +const cgoEnabled = false diff --git a/pebble/internal/cache/cgo_enabled.go b/pebble/internal/cache/cgo_enabled.go new file mode 100644 index 0000000..b7014cb --- /dev/null +++ b/pebble/internal/cache/cgo_enabled.go @@ -0,0 +1,10 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build cgo +// +build cgo + +package cache + +const cgoEnabled = true diff --git a/pebble/internal/cache/clockpro.go b/pebble/internal/cache/clockpro.go new file mode 100644 index 0000000..cdae6a9 --- /dev/null +++ b/pebble/internal/cache/clockpro.go @@ -0,0 +1,909 @@ +// Copyright 2018. All rights reserved. Use of this source code is governed by +// an MIT-style license that can be found in the LICENSE file. + +// Package cache implements the CLOCK-Pro caching algorithm. +// +// CLOCK-Pro is a patent-free alternative to the Adaptive Replacement Cache, +// https://en.wikipedia.org/wiki/Adaptive_replacement_cache. 
+// It is an approximation of LIRS ( https://en.wikipedia.org/wiki/LIRS_caching_algorithm ), +// much like the CLOCK page replacement algorithm is an approximation of LRU. +// +// This implementation is based on the python code from https://bitbucket.org/SamiLehtinen/pyclockpro . +// +// Slides describing the algorithm: http://fr.slideshare.net/huliang64/clockpro +// +// The original paper: http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html +// +// It is MIT licensed, like the original. +package cache // import "github.com/cockroachdb/pebble/internal/cache" + +import ( + "fmt" + "os" + "runtime" + "runtime/debug" + "strings" + "sync" + "sync/atomic" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" +) + +type fileKey struct { + // id is the namespace for fileNums. + id uint64 + fileNum base.DiskFileNum +} + +type key struct { + fileKey + offset uint64 +} + +// file returns the "file key" for the receiver. This is the key used for the +// shard.files map. +func (k key) file() key { + k.offset = 0 + return k +} + +func (k key) String() string { + return fmt.Sprintf("%d/%d/%d", k.id, k.fileNum, k.offset) +} + +// Handle provides a strong reference to a value in the cache. The reference +// does not pin the value in the cache, but it does prevent the underlying byte +// slice from being reused. +type Handle struct { + value *Value +} + +// Get returns the value stored in handle. +func (h Handle) Get() []byte { + if h.value != nil { + // NB: We don't increment shard.hits in this code path because we only want + // to record a hit when the handle is retrieved from the cache. + return h.value.buf + } + return nil +} + +// Release releases the reference to the cache entry. 
+func (h Handle) Release() { + h.value.release() +} + +type shard struct { + hits atomic.Int64 + misses atomic.Int64 + + mu sync.RWMutex + + reservedSize int64 + maxSize int64 + coldTarget int64 + blocks robinHoodMap // fileNum+offset -> block + files robinHoodMap // fileNum -> list of blocks + + // The blocks and files maps store values in manually managed memory that is + // invisible to the Go GC. This is fine for Value and entry objects that are + // stored in manually managed memory, but when the "invariants" build tag is + // set, all Value and entry objects are Go allocated and the entries map will + // contain a reference to every entry. + entries map[*entry]struct{} + + handHot *entry + handCold *entry + handTest *entry + + sizeHot int64 + sizeCold int64 + sizeTest int64 + + // The count fields are used exclusively for asserting expectations. + // We've seen infinite looping (cockroachdb/cockroach#70154) that + // could be explained by a corrupted sizeCold. Through asserting on + // these fields, we hope to gain more insight from any future + // reproductions. + countHot int64 + countCold int64 + countTest int64 +} + +func (c *shard) Get(id uint64, fileNum base.DiskFileNum, offset uint64) Handle { + c.mu.RLock() + var value *Value + if e := c.blocks.Get(key{fileKey{id, fileNum}, offset}); e != nil { + value = e.acquireValue() + if value != nil { + e.referenced.Store(true) + } + } + c.mu.RUnlock() + if value == nil { + c.misses.Add(1) + return Handle{} + } + c.hits.Add(1) + return Handle{value: value} +} + +func (c *shard) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *Value) Handle { + if n := value.refs(); n != 1 { + panic(fmt.Sprintf("pebble: Value has already been added to the cache: refs=%d", n)) + } + + c.mu.Lock() + defer c.mu.Unlock() + + k := key{fileKey{id, fileNum}, offset} + e := c.blocks.Get(k) + + switch { + case e == nil: + // no cache entry? 
add it + e = newEntry(c, k, int64(len(value.buf))) + e.setValue(value) + if c.metaAdd(k, e) { + value.ref.trace("add-cold") + c.sizeCold += e.size + c.countCold++ + } else { + value.ref.trace("skip-cold") + e.free() + e = nil + } + + case e.peekValue() != nil: + // cache entry was a hot or cold page + e.setValue(value) + e.referenced.Store(true) + delta := int64(len(value.buf)) - e.size + e.size = int64(len(value.buf)) + if e.ptype == etHot { + value.ref.trace("add-hot") + c.sizeHot += delta + } else { + value.ref.trace("add-cold") + c.sizeCold += delta + } + c.evict() + + default: + // cache entry was a test page + c.sizeTest -= e.size + c.countTest-- + c.metaDel(e).release() + c.metaCheck(e) + + e.size = int64(len(value.buf)) + c.coldTarget += e.size + if c.coldTarget > c.targetSize() { + c.coldTarget = c.targetSize() + } + + e.referenced.Store(false) + e.setValue(value) + e.ptype = etHot + if c.metaAdd(k, e) { + value.ref.trace("add-hot") + c.sizeHot += e.size + c.countHot++ + } else { + value.ref.trace("skip-hot") + e.free() + e = nil + } + } + + c.checkConsistency() + + // Values are initialized with a reference count of 1. That reference count + // is being transferred to the returned Handle. + return Handle{value: value} +} + +func (c *shard) checkConsistency() { + // See the comment above the count{Hot,Cold,Test} fields. 
+ switch { + case c.sizeHot < 0 || c.sizeCold < 0 || c.sizeTest < 0 || c.countHot < 0 || c.countCold < 0 || c.countTest < 0: + panic(fmt.Sprintf("pebble: unexpected negative: %d (%d bytes) hot, %d (%d bytes) cold, %d (%d bytes) test", + c.countHot, c.sizeHot, c.countCold, c.sizeCold, c.countTest, c.sizeTest)) + case c.sizeHot > 0 && c.countHot == 0: + panic(fmt.Sprintf("pebble: mismatch %d hot size, %d hot count", c.sizeHot, c.countHot)) + case c.sizeCold > 0 && c.countCold == 0: + panic(fmt.Sprintf("pebble: mismatch %d cold size, %d cold count", c.sizeCold, c.countCold)) + case c.sizeTest > 0 && c.countTest == 0: + panic(fmt.Sprintf("pebble: mismatch %d test size, %d test count", c.sizeTest, c.countTest)) + } +} + +// Delete deletes the cached value for the specified file and offset. +func (c *shard) Delete(id uint64, fileNum base.DiskFileNum, offset uint64) { + // The common case is there is nothing to delete, so do a quick check with + // shared lock. + k := key{fileKey{id, fileNum}, offset} + c.mu.RLock() + exists := c.blocks.Get(k) != nil + c.mu.RUnlock() + if !exists { + return + } + + var deletedValue *Value + func() { + c.mu.Lock() + defer c.mu.Unlock() + + e := c.blocks.Get(k) + if e == nil { + return + } + deletedValue = c.metaEvict(e) + c.checkConsistency() + }() + // Now that the mutex has been dropped, release the reference which will + // potentially free the memory associated with the previous cached value. + deletedValue.release() +} + +// EvictFile evicts all of the cache values for the specified file. +func (c *shard) EvictFile(id uint64, fileNum base.DiskFileNum) { + fkey := key{fileKey{id, fileNum}, 0} + for c.evictFileRun(fkey) { + // Sched switch to give another goroutine an opportunity to acquire the + // shard mutex. + runtime.Gosched() + } +} + +func (c *shard) evictFileRun(fkey key) (moreRemaining bool) { + // If most of the file's blocks are held in the block cache, evicting all + // the blocks may take a while. 
We don't want to block the entire cache + // shard, forcing concurrent readers to wait until we're finished. We drop + // the mutex every [blocksPerMutexAcquisition] blocks to give other + // goroutines an opportunity to make progress. + const blocksPerMutexAcquisition = 5 + c.mu.Lock() + + // Releasing a value may result in free-ing it back to the memory allocator. + // This can have a nontrivial cost that we'd prefer to not pay while holding + // the shard mutex, so we collect the evicted values in a local slice and + // only release them in a defer after dropping the cache mutex. + var obsoleteValuesAlloc [blocksPerMutexAcquisition]*Value + obsoleteValues := obsoleteValuesAlloc[:0] + defer func() { + c.mu.Unlock() + for _, v := range obsoleteValues { + v.release() + } + }() + + blocks := c.files.Get(fkey) + if blocks == nil { + // No blocks for this file. + return false + } + + // b is the current head of the doubly linked list, and n is the entry after b. + for b, n := blocks, (*entry)(nil); len(obsoleteValues) < cap(obsoleteValues); b = n { + n = b.fileLink.next + obsoleteValues = append(obsoleteValues, c.metaEvict(b)) + if b == n { + // b == n represents the case where b was the last entry remaining + // in the doubly linked list, which is why it pointed at itself. So + // no more entries left. + c.checkConsistency() + return false + } + } + // Exhausted blocksPerMutexAcquisition. + return true +} + +func (c *shard) Free() { + c.mu.Lock() + defer c.mu.Unlock() + + // NB: we use metaDel rather than metaEvict in order to avoid the expensive + // metaCheck call when the "invariants" build tag is specified. + for c.handHot != nil { + e := c.handHot + c.metaDel(c.handHot).release() + e.free() + } + + c.blocks.free() + c.files.free() +} + +func (c *shard) Reserve(n int) { + c.mu.Lock() + defer c.mu.Unlock() + c.reservedSize += int64(n) + + // Changing c.reservedSize will either increase or decrease + // the targetSize. 
But we want coldTarget to be in the range + // [0, targetSize]. So, if c.targetSize decreases, make sure + // that the coldTarget fits within the limits. + targetSize := c.targetSize() + if c.coldTarget > targetSize { + c.coldTarget = targetSize + } + + c.evict() + c.checkConsistency() +} + +// Size returns the current space used by the cache. +func (c *shard) Size() int64 { + c.mu.RLock() + size := c.sizeHot + c.sizeCold + c.mu.RUnlock() + return size +} + +func (c *shard) targetSize() int64 { + target := c.maxSize - c.reservedSize + // Always return a positive integer for targetSize. This is so that we don't + // end up in an infinite loop in evict(), in cases where reservedSize is + // greater than or equal to maxSize. + if target < 1 { + return 1 + } + return target +} + +// Add the entry to the cache, returning true if the entry was added and false +// if it would not fit in the cache. +func (c *shard) metaAdd(key key, e *entry) bool { + c.evict() + if e.size > c.targetSize() { + // The entry is larger than the target cache size. + return false + } + + c.blocks.Put(key, e) + if entriesGoAllocated { + // Go allocated entries need to be referenced from Go memory. The entries + // map provides that reference. + c.entries[e] = struct{}{} + } + + if c.handHot == nil { + // first element + c.handHot = e + c.handCold = e + c.handTest = e + } else { + c.handHot.link(e) + } + + if c.handCold == c.handHot { + c.handCold = c.handCold.prev() + } + + fkey := key.file() + if fileBlocks := c.files.Get(fkey); fileBlocks == nil { + c.files.Put(fkey, e) + } else { + fileBlocks.linkFile(e) + } + return true +} + +// Remove the entry from the cache. This removes the entry from the blocks map, +// the files map, and ensures that hand{Hot,Cold,Test} are not pointing at the +// entry. Returns the deleted value that must be released, if any. 
+func (c *shard) metaDel(e *entry) (deletedValue *Value) { + if value := e.peekValue(); value != nil { + value.ref.trace("metaDel") + } + // Remove the pointer to the value. + deletedValue = e.val + e.val = nil + + c.blocks.Delete(e.key) + if entriesGoAllocated { + // Go allocated entries need to be referenced from Go memory. The entries + // map provides that reference. + delete(c.entries, e) + } + + if e == c.handHot { + c.handHot = c.handHot.prev() + } + if e == c.handCold { + c.handCold = c.handCold.prev() + } + if e == c.handTest { + c.handTest = c.handTest.prev() + } + + if e.unlink() == e { + // This was the last entry in the cache. + c.handHot = nil + c.handCold = nil + c.handTest = nil + } + + fkey := e.key.file() + if next := e.unlinkFile(); e == next { + c.files.Delete(fkey) + } else { + c.files.Put(fkey, next) + } + return deletedValue +} + +// Check that the specified entry is not referenced by the cache. +func (c *shard) metaCheck(e *entry) { + if invariants.Enabled { + if _, ok := c.entries[e]; ok { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in entries map\n%s", + e, e.key, debug.Stack()) + os.Exit(1) + } + if c.blocks.findByValue(e) != nil { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in blocks map\n%s\n%s", + e, e.key, &c.blocks, debug.Stack()) + os.Exit(1) + } + if c.files.findByValue(e) != nil { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in files map\n%s\n%s", + e, e.key, &c.files, debug.Stack()) + os.Exit(1) + } + // NB: c.hand{Hot,Cold,Test} are pointers into a single linked list. We + // only have to traverse one of them to check all of them. + var countHot, countCold, countTest int64 + var sizeHot, sizeCold, sizeTest int64 + for t := c.handHot.next(); t != nil; t = t.next() { + // Recompute count{Hot,Cold,Test} and size{Hot,Cold,Test}. 
+ switch t.ptype { + case etHot: + countHot++ + sizeHot += t.size + case etCold: + countCold++ + sizeCold += t.size + case etTest: + countTest++ + sizeTest += t.size + } + if e == t { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in blocks list\n%s", + e, e.key, debug.Stack()) + os.Exit(1) + } + if t == c.handHot { + break + } + } + if countHot != c.countHot || countCold != c.countCold || countTest != c.countTest || + sizeHot != c.sizeHot || sizeCold != c.sizeCold || sizeTest != c.sizeTest { + fmt.Fprintf(os.Stderr, `divergence of Hot,Cold,Test statistics + cache's statistics: hot %d, %d, cold %d, %d, test %d, %d + recalculated statistics: hot %d, %d, cold %d, %d, test %d, %d\n%s`, + c.countHot, c.sizeHot, c.countCold, c.sizeCold, c.countTest, c.sizeTest, + countHot, sizeHot, countCold, sizeCold, countTest, sizeTest, + debug.Stack()) + os.Exit(1) + } + } +} + +func (c *shard) metaEvict(e *entry) (evictedValue *Value) { + switch e.ptype { + case etHot: + c.sizeHot -= e.size + c.countHot-- + case etCold: + c.sizeCold -= e.size + c.countCold-- + case etTest: + c.sizeTest -= e.size + c.countTest-- + } + evictedValue = c.metaDel(e) + c.metaCheck(e) + e.free() + return evictedValue +} + +func (c *shard) evict() { + for c.targetSize() <= c.sizeHot+c.sizeCold && c.handCold != nil { + c.runHandCold(c.countCold, c.sizeCold) + } +} + +func (c *shard) runHandCold(countColdDebug, sizeColdDebug int64) { + // countColdDebug and sizeColdDebug should equal c.countCold and + // c.sizeCold. They're parameters only to aid in debugging of + // cockroachdb/cockroach#70154. Since they're parameters, their + // arguments will appear within stack traces should we encounter + // a reproduction. 
+	if c.countCold != countColdDebug || c.sizeCold != sizeColdDebug {
+		panic(fmt.Sprintf("runHandCold: cold count and size are %d, %d, arguments are %d and %d",
+			c.countCold, c.sizeCold, countColdDebug, sizeColdDebug))
+	}
+
+	e := c.handCold
+	if e.ptype == etCold {
+		if e.referenced.Load() {
+			e.referenced.Store(false)
+			e.ptype = etHot
+			c.sizeCold -= e.size
+			c.countCold--
+			c.sizeHot += e.size
+			c.countHot++
+		} else {
+			e.setValue(nil)
+			e.ptype = etTest
+			c.sizeCold -= e.size
+			c.countCold--
+			c.sizeTest += e.size
+			c.countTest++
+			for c.targetSize() < c.sizeTest && c.handTest != nil {
+				c.runHandTest()
+			}
+		}
+	}
+
+	c.handCold = c.handCold.next()
+
+	for c.targetSize()-c.coldTarget <= c.sizeHot && c.handHot != nil {
+		c.runHandHot()
+	}
+}
+
+func (c *shard) runHandHot() {
+	if c.handHot == c.handTest && c.handTest != nil {
+		c.runHandTest()
+		if c.handHot == nil {
+			return
+		}
+	}
+
+	e := c.handHot
+	if e.ptype == etHot {
+		if e.referenced.Load() {
+			e.referenced.Store(false)
+		} else {
+			e.ptype = etCold
+			c.sizeHot -= e.size
+			c.countHot--
+			c.sizeCold += e.size
+			c.countCold++
+		}
+	}
+
+	c.handHot = c.handHot.next()
+}
+
+func (c *shard) runHandTest() {
+	if c.sizeCold > 0 && c.handTest == c.handCold && c.handCold != nil {
+		// sizeCold is > 0, so assert that countCold > 0. See the
+		// comment above count{Hot,Cold,Test}.
+		if c.countCold == 0 {
+			panic(fmt.Sprintf("pebble: mismatch %d cold size, %d cold count", c.sizeCold, c.countCold))
+		}
+
+		c.runHandCold(c.countCold, c.sizeCold)
+		if c.handTest == nil {
+			return
+		}
+	}
+
+	e := c.handTest
+	if e.ptype == etTest {
+		c.sizeTest -= e.size
+		c.countTest--
+		c.coldTarget -= e.size
+		if c.coldTarget < 0 {
+			c.coldTarget = 0
+		}
+		c.metaDel(e).release()
+		c.metaCheck(e)
+		e.free()
+	}
+
+	c.handTest = c.handTest.next()
+}
+
+// Metrics holds metrics for the cache.
+type Metrics struct {
+	// The number of bytes inuse by the cache.
+ Size int64 + // The count of objects (blocks or tables) in the cache. + Count int64 + // The number of cache hits. + Hits int64 + // The number of cache misses. + Misses int64 +} + +// Cache implements Pebble's sharded block cache. The Clock-PRO algorithm is +// used for page replacement +// (http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html). In +// order to provide better concurrency, 4 x NumCPUs shards are created, with +// each shard being given 1/n of the target cache size. The Clock-PRO algorithm +// is run independently on each shard. +// +// Blocks are keyed by an (id, fileNum, offset) triple. The ID is a namespace +// for file numbers and allows a single Cache to be shared between multiple +// Pebble instances. The fileNum and offset refer to an sstable file number and +// the offset of the block within the file. Because sstables are immutable and +// file numbers are never reused, (fileNum,offset) are unique for the lifetime +// of a Pebble instance. +// +// In addition to maintaining a map from (fileNum,offset) to data, each shard +// maintains a map of the cached blocks for a particular fileNum. This allows +// efficient eviction of all of the blocks for a file which is used when an +// sstable is deleted from disk. +// +// # Memory Management +// +// In order to reduce pressure on the Go GC, manual memory management is +// performed for the data stored in the cache. Manual memory management is +// performed by calling into C.{malloc,free} to allocate memory. Cache.Values +// are reference counted and the memory backing a manual value is freed when +// the reference count drops to 0. +// +// Manual memory management brings the possibility of memory leaks. It is +// imperative that every Handle returned by Cache.{Get,Set} is eventually +// released. The "invariants" build tag enables a leak detection facility that +// places a GC finalizer on cache.Value. 
When the cache.Value finalizer is run,
+// if the underlying buffer is still present a leak has occurred. The "tracing"
+// build tag enables tracing of cache.Value reference count manipulation and
+// eases finding where a leak has occurred. These two facilities are usually
+// used in combination by specifying `-tags invariants,tracing`. Note that
+// "tracing" produces a significant slowdown, while "invariants" does not.
+type Cache struct {
+	refs    atomic.Int64
+	maxSize int64
+	idAlloc atomic.Uint64
+	shards  []shard
+
+	// Traces recorded by Cache.trace. Used for debugging.
+	tr struct {
+		sync.Mutex
+		msgs []string
+	}
+}
+
+// New creates a new cache of the specified size. Memory for the cache is
+// allocated on demand, not during initialization. The cache is created with a
+// reference count of 1. Each DB it is associated with adds a reference, so the
+// creator of the cache should usually release their reference after the DB is
+// created.
+//
+//	c := cache.New(...)
+//	defer c.Unref()
+//	d, err := pebble.Open(pebble.Options{Cache: c})
+func New(size int64) *Cache {
+	// How many cache shards should we create?
+	//
+	// Note that the probability two processors will try to access the same
+	// shard at the same time increases superlinearly with the number of
+	// processors (Eg, consider the birthday problem where each CPU is a person,
+	// and each shard is a possible birthday).
+	//
+	// We could consider growing the number of shards superlinearly, but
+	// increasing the shard count may reduce the effectiveness of the caching
+	// algorithm if frequently-accessed blocks are insufficiently distributed
+	// across shards. If a shard's size is smaller than a single frequently
+	// scanned sstable, then the shard will be unable to hold the entire
+	// frequently-scanned table in memory despite other shards still holding
+	// infrequently accessed blocks.
+ // + // Experimentally, we've observed contention contributing to tail latencies + // at 2 shards per processor. For now we use 4 shards per processor, + // recognizing this may not be final word. + m := 4 * runtime.GOMAXPROCS(0) + + // In tests we can use large CPU machines with small cache sizes and have + // many caches in existence at a time. If sharding into m shards would + // produce too small shards, constrain the number of shards to 4. + const minimumShardSize = 4 << 20 // 4 MiB + if m > 4 && int(size)/m < minimumShardSize { + m = 4 + } + return newShards(size, m) +} + +func newShards(size int64, shards int) *Cache { + c := &Cache{ + maxSize: size, + shards: make([]shard, shards), + } + c.refs.Store(1) + c.idAlloc.Store(1) + c.trace("alloc", c.refs.Load()) + for i := range c.shards { + c.shards[i] = shard{ + maxSize: size / int64(len(c.shards)), + coldTarget: size / int64(len(c.shards)), + } + if entriesGoAllocated { + c.shards[i].entries = make(map[*entry]struct{}) + } + c.shards[i].blocks.init(16) + c.shards[i].files.init(16) + } + + // Note: this is a no-op if invariants are disabled or race is enabled. + invariants.SetFinalizer(c, func(obj interface{}) { + c := obj.(*Cache) + if v := c.refs.Load(); v != 0 { + c.tr.Lock() + fmt.Fprintf(os.Stderr, + "pebble: cache (%p) has non-zero reference count: %d\n", c, v) + if len(c.tr.msgs) > 0 { + fmt.Fprintf(os.Stderr, "%s\n", strings.Join(c.tr.msgs, "\n")) + } + c.tr.Unlock() + os.Exit(1) + } + }) + return c +} + +func (c *Cache) getShard(id uint64, fileNum base.DiskFileNum, offset uint64) *shard { + if id == 0 { + panic("pebble: 0 cache ID is invalid") + } + + // Inlined version of fnv.New64 + Write. 
+ const offset64 = 14695981039346656037 + const prime64 = 1099511628211 + + h := uint64(offset64) + for i := 0; i < 8; i++ { + h *= prime64 + h ^= uint64(id & 0xff) + id >>= 8 + } + fileNumVal := uint64(fileNum.FileNum()) + for i := 0; i < 8; i++ { + h *= prime64 + h ^= uint64(fileNumVal) & 0xff + fileNumVal >>= 8 + } + for i := 0; i < 8; i++ { + h *= prime64 + h ^= uint64(offset & 0xff) + offset >>= 8 + } + + return &c.shards[h%uint64(len(c.shards))] +} + +// Ref adds a reference to the cache. The cache only remains valid as long a +// reference is maintained to it. +func (c *Cache) Ref() { + v := c.refs.Add(1) + if v <= 1 { + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + } + c.trace("ref", v) +} + +// Unref releases a reference on the cache. +func (c *Cache) Unref() { + v := c.refs.Add(-1) + c.trace("unref", v) + switch { + case v < 0: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + case v == 0: + for i := range c.shards { + c.shards[i].Free() + } + } +} + +// Get retrieves the cache value for the specified file and offset, returning +// nil if no value is present. +func (c *Cache) Get(id uint64, fileNum base.DiskFileNum, offset uint64) Handle { + return c.getShard(id, fileNum, offset).Get(id, fileNum, offset) +} + +// Set sets the cache value for the specified file and offset, overwriting an +// existing value if present. A Handle is returned which provides faster +// retrieval of the cached value than Get (lock-free and avoidance of the map +// lookup). The value must have been allocated by Cache.Alloc. +func (c *Cache) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *Value) Handle { + return c.getShard(id, fileNum, offset).Set(id, fileNum, offset, value) +} + +// Delete deletes the cached value for the specified file and offset. 
+func (c *Cache) Delete(id uint64, fileNum base.DiskFileNum, offset uint64) { + c.getShard(id, fileNum, offset).Delete(id, fileNum, offset) +} + +// EvictFile evicts all of the cache values for the specified file. +func (c *Cache) EvictFile(id uint64, fileNum base.DiskFileNum) { + if id == 0 { + panic("pebble: 0 cache ID is invalid") + } + for i := range c.shards { + c.shards[i].EvictFile(id, fileNum) + } +} + +// MaxSize returns the max size of the cache. +func (c *Cache) MaxSize() int64 { + return c.maxSize +} + +// Size returns the current space used by the cache. +func (c *Cache) Size() int64 { + var size int64 + for i := range c.shards { + size += c.shards[i].Size() + } + return size +} + +// Alloc allocates a byte slice of the specified size, possibly reusing +// previously allocated but unused memory. The memory backing the value is +// manually managed. The caller MUST either add the value to the cache (via +// Cache.Set), or release the value (via Cache.Free). Failure to do so will +// result in a memory leak. +func Alloc(n int) *Value { + return newValue(n) +} + +// Free frees the specified value. The buffer associated with the value will +// possibly be reused, making it invalid to use the buffer after calling +// Free. Do not call Free on a value that has been added to the cache. +func Free(v *Value) { + if n := v.refs(); n > 1 { + panic(fmt.Sprintf("pebble: Value has been added to the cache: refs=%d", n)) + } + v.release() +} + +// Reserve N bytes in the cache. This effectively shrinks the size of the cache +// by N bytes, without actually consuming any memory. The returned closure +// should be invoked to release the reservation. +func (c *Cache) Reserve(n int) func() { + // Round-up the per-shard reservation. Most reservations should be large, so + // this probably doesn't matter in practice. 
+ shardN := (n + len(c.shards) - 1) / len(c.shards) + for i := range c.shards { + c.shards[i].Reserve(shardN) + } + return func() { + if shardN == -1 { + panic("pebble: cache reservation already released") + } + for i := range c.shards { + c.shards[i].Reserve(-shardN) + } + shardN = -1 + } +} + +// Metrics returns the metrics for the cache. +func (c *Cache) Metrics() Metrics { + var m Metrics + for i := range c.shards { + s := &c.shards[i] + s.mu.RLock() + m.Count += int64(s.blocks.Count()) + m.Size += s.sizeHot + s.sizeCold + s.mu.RUnlock() + m.Hits += s.hits.Load() + m.Misses += s.misses.Load() + } + return m +} + +// NewID returns a new ID to be used as a namespace for cached file +// blocks. +func (c *Cache) NewID() uint64 { + return c.idAlloc.Add(1) +} diff --git a/pebble/internal/cache/clockpro_normal.go b/pebble/internal/cache/clockpro_normal.go new file mode 100644 index 0000000..ae49938 --- /dev/null +++ b/pebble/internal/cache/clockpro_normal.go @@ -0,0 +1,10 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !tracing +// +build !tracing + +package cache + +func (c *Cache) trace(_ string, _ int64) {} diff --git a/pebble/internal/cache/clockpro_test.go b/pebble/internal/cache/clockpro_test.go new file mode 100644 index 0000000..5ec7b7f --- /dev/null +++ b/pebble/internal/cache/clockpro_test.go @@ -0,0 +1,279 @@ +// Copyright 2018. All rights reserved. Use of this source code is governed by +// an MIT-style license that can be found in the LICENSE file. 
+ +package cache + +import ( + "bufio" + "bytes" + "fmt" + "os" + "runtime" + "strconv" + "sync" + "testing" + "time" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/stretchr/testify/require" + "golang.org/x/exp/rand" +) + +func TestCache(t *testing.T) { + // Test data was generated from the python code + f, err := os.Open("testdata/cache") + require.NoError(t, err) + + cache := newShards(200, 1) + defer cache.Unref() + + scanner := bufio.NewScanner(f) + line := 1 + + for scanner.Scan() { + fields := bytes.Fields(scanner.Bytes()) + + key, err := strconv.Atoi(string(fields[0])) + require.NoError(t, err) + + wantHit := fields[1][0] == 'h' + + var hit bool + h := cache.Get(1, base.FileNum(uint64(key)).DiskFileNum(), 0) + if v := h.Get(); v == nil { + value := Alloc(1) + value.Buf()[0] = fields[0][0] + cache.Set(1, base.FileNum(uint64(key)).DiskFileNum(), 0, value).Release() + } else { + hit = true + if !bytes.Equal(v, fields[0][:1]) { + t.Errorf("%d: cache returned bad data: got %s , want %s\n", line, v, fields[0][:1]) + } + } + h.Release() + if hit != wantHit { + t.Errorf("%d: cache hit mismatch: got %v, want %v\n", line, hit, wantHit) + } + line++ + } +} + +func testValue(cache *Cache, s string, repeat int) *Value { + b := bytes.Repeat([]byte(s), repeat) + v := Alloc(len(b)) + copy(v.Buf(), b) + return v +} + +func TestCacheDelete(t *testing.T) { + cache := newShards(100, 1) + defer cache.Unref() + + cache.Set(1, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 5)).Release() + cache.Set(1, base.FileNum(1).DiskFileNum(), 0, testValue(cache, "a", 5)).Release() + cache.Set(1, base.FileNum(2).DiskFileNum(), 0, testValue(cache, "a", 5)).Release() + if expected, size := int64(15), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", expected, size) + } + cache.Delete(1, base.FileNum(1).DiskFileNum(), 0) + if expected, size := int64(10), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", 
expected, size) + } + if h := cache.Get(1, base.FileNum(0).DiskFileNum(), 0); h.Get() == nil { + t.Fatalf("expected to find block 0/0") + } else { + h.Release() + } + if h := cache.Get(1, base.FileNum(1).DiskFileNum(), 0); h.Get() != nil { + t.Fatalf("expected to not find block 1/0") + } else { + h.Release() + } + // Deleting a non-existing block does nothing. + cache.Delete(1, base.FileNum(1).DiskFileNum(), 0) + if expected, size := int64(10), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", expected, size) + } +} + +func TestEvictFile(t *testing.T) { + cache := newShards(100, 1) + defer cache.Unref() + + cache.Set(1, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 5)).Release() + cache.Set(1, base.FileNum(1).DiskFileNum(), 0, testValue(cache, "a", 5)).Release() + cache.Set(1, base.FileNum(2).DiskFileNum(), 0, testValue(cache, "a", 5)).Release() + cache.Set(1, base.FileNum(2).DiskFileNum(), 1, testValue(cache, "a", 5)).Release() + cache.Set(1, base.FileNum(2).DiskFileNum(), 2, testValue(cache, "a", 5)).Release() + if expected, size := int64(25), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", expected, size) + } + cache.EvictFile(1, base.FileNum(0).DiskFileNum()) + if expected, size := int64(20), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", expected, size) + } + cache.EvictFile(1, base.FileNum(1).DiskFileNum()) + if expected, size := int64(15), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", expected, size) + } + cache.EvictFile(1, base.FileNum(2).DiskFileNum()) + if expected, size := int64(0), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", expected, size) + } +} + +func TestEvictAll(t *testing.T) { + // Verify that it is okay to evict all of the data from a cache. Previously + // this would trigger a nil-pointer dereference. 
+ cache := newShards(100, 1) + defer cache.Unref() + + cache.Set(1, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 101)).Release() + cache.Set(1, base.FileNum(1).DiskFileNum(), 0, testValue(cache, "a", 101)).Release() +} + +func TestMultipleDBs(t *testing.T) { + cache := newShards(100, 1) + defer cache.Unref() + + cache.Set(1, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 5)).Release() + cache.Set(2, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "b", 5)).Release() + if expected, size := int64(10), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", expected, size) + } + cache.EvictFile(1, base.FileNum(0).DiskFileNum()) + if expected, size := int64(5), cache.Size(); expected != size { + t.Fatalf("expected cache size %d, but found %d", expected, size) + } + h := cache.Get(1, base.FileNum(0).DiskFileNum(), 0) + if v := h.Get(); v != nil { + t.Fatalf("expected not present, but found %s", v) + } + h = cache.Get(2, base.FileNum(0).DiskFileNum(), 0) + if v := h.Get(); string(v) != "bbbbb" { + t.Fatalf("expected bbbbb, but found %s", v) + } else { + h.Release() + } +} + +func TestZeroSize(t *testing.T) { + cache := newShards(0, 1) + defer cache.Unref() + + cache.Set(1, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 5)).Release() +} + +func TestReserve(t *testing.T) { + cache := newShards(4, 2) + defer cache.Unref() + + cache.Set(1, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 1)).Release() + cache.Set(2, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 1)).Release() + require.EqualValues(t, 2, cache.Size()) + r := cache.Reserve(1) + require.EqualValues(t, 0, cache.Size()) + cache.Set(1, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 1)).Release() + cache.Set(2, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 1)).Release() + cache.Set(3, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 1)).Release() + cache.Set(4, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 
1)).Release() + require.EqualValues(t, 2, cache.Size()) + r() + require.EqualValues(t, 2, cache.Size()) + cache.Set(1, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 1)).Release() + cache.Set(2, base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 1)).Release() + require.EqualValues(t, 4, cache.Size()) +} + +func TestReserveDoubleRelease(t *testing.T) { + cache := newShards(100, 1) + defer cache.Unref() + + r := cache.Reserve(10) + r() + + result := func() (result string) { + defer func() { + if v := recover(); v != nil { + result = fmt.Sprint(v) + } + }() + r() + return "" + }() + const expected = "pebble: cache reservation already released" + if expected != result { + t.Fatalf("expected %q, but found %q", expected, result) + } +} + +func TestCacheStressSetExisting(t *testing.T) { + cache := newShards(1, 1) + defer cache.Unref() + + var wg sync.WaitGroup + for i := 0; i < 10; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + for j := 0; j < 10000; j++ { + cache.Set(1, base.FileNum(0).DiskFileNum(), uint64(i), testValue(cache, "a", 1)).Release() + runtime.Gosched() + } + }(i) + } + wg.Wait() +} + +func BenchmarkCacheGet(b *testing.B) { + const size = 100000 + + cache := newShards(size, 1) + defer cache.Unref() + + for i := 0; i < size; i++ { + v := testValue(cache, "a", 1) + cache.Set(1, base.FileNum(0).DiskFileNum(), uint64(i), v).Release() + } + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + + for pb.Next() { + h := cache.Get(1, base.FileNum(0).DiskFileNum(), uint64(rng.Intn(size))) + if h.Get() == nil { + b.Fatal("failed to lookup value") + } + h.Release() + } + }) +} + +func TestReserveColdTarget(t *testing.T) { + // If coldTarget isn't updated when we call shard.Reserve, + // then we unnecessarily remove nodes from the + // cache. 
+ + cache := newShards(100, 1) + defer cache.Unref() + + for i := 0; i < 50; i++ { + cache.Set(uint64(i+1), base.FileNum(0).DiskFileNum(), 0, testValue(cache, "a", 1)).Release() + } + + if cache.Size() != 50 { + require.Equal(t, 50, cache.Size(), "nodes were unnecessarily evicted from the cache") + } + + // There won't be enough space left for 50 nodes in the cache after + // we call shard.Reserve. This should trigger a call to evict. + cache.Reserve(51) + + // If we don't update coldTarget in Reserve then the cache gets emptied to + // size 0. In shard.Evict, we loop until shard.Size() < shard.targetSize(). + // Therefore, 100 - 51 = 49, but we evict one more node. + if cache.Size() != 48 { + t.Fatalf("expected positive cache size %d, but found %d", 48, cache.Size()) + } +} diff --git a/pebble/internal/cache/clockpro_tracing.go b/pebble/internal/cache/clockpro_tracing.go new file mode 100644 index 0000000..d14c1cd --- /dev/null +++ b/pebble/internal/cache/clockpro_tracing.go @@ -0,0 +1,20 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build tracing +// +build tracing + +package cache + +import ( + "fmt" + "runtime/debug" +) + +func (c *Cache) trace(msg string, refs int64) { + s := fmt.Sprintf("%s: refs=%d\n%s", msg, refs, debug.Stack()) + c.tr.Lock() + c.tr.msgs = append(c.tr.msgs, s) + c.tr.Unlock() +} diff --git a/pebble/internal/cache/entry.go b/pebble/internal/cache/entry.go new file mode 100644 index 0000000..a49fde6 --- /dev/null +++ b/pebble/internal/cache/entry.go @@ -0,0 +1,155 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package cache + +import "sync/atomic" + +type entryType int8 + +const ( + etTest entryType = iota + etCold + etHot +) + +func (p entryType) String() string { + switch p { + case etTest: + return "test" + case etCold: + return "cold" + case etHot: + return "hot" + } + return "unknown" +} + +// entry holds the metadata for a cache entry. The memory for an entry is +// allocated from manually managed memory. +// +// Using manual memory management for entries is technically a violation of the +// Cgo pointer rules: +// +// https://golang.org/cmd/cgo/#hdr-Passing_pointers +// +// Specifically, Go pointers should not be stored in C allocated memory. The +// reason for this rule is that the Go GC will not look at C allocated memory +// to find pointers to Go objects. If the only reference to a Go object is +// stored in C allocated memory, the object will be reclaimed. The shard field +// of the entry struct points to a Go allocated object, thus the +// violation. What makes this "safe" is that the Cache guarantees that there +// are other pointers to the shard which will keep it alive. +type entry struct { + key key + // The value associated with the entry. The entry holds a reference on the + // value which is maintained by entry.setValue(). + val *Value + blockLink struct { + next *entry + prev *entry + } + fileLink struct { + next *entry + prev *entry + } + size int64 + ptype entryType + // referenced is atomically set to indicate that this entry has been accessed + // since the last time one of the clock hands swept it. + referenced atomic.Bool + shard *shard + // Reference count for the entry. The entry is freed when the reference count + // drops to zero. 
+ ref refcnt +} + +func newEntry(s *shard, key key, size int64) *entry { + e := entryAllocNew() + *e = entry{ + key: key, + size: size, + ptype: etCold, + shard: s, + } + e.blockLink.next = e + e.blockLink.prev = e + e.fileLink.next = e + e.fileLink.prev = e + e.ref.init(1) + return e +} + +func (e *entry) free() { + e.setValue(nil) + *e = entry{} + entryAllocFree(e) +} + +func (e *entry) next() *entry { + if e == nil { + return nil + } + return e.blockLink.next +} + +func (e *entry) prev() *entry { + if e == nil { + return nil + } + return e.blockLink.prev +} + +func (e *entry) link(s *entry) { + s.blockLink.prev = e.blockLink.prev + s.blockLink.prev.blockLink.next = s + s.blockLink.next = e + s.blockLink.next.blockLink.prev = s +} + +func (e *entry) unlink() *entry { + next := e.blockLink.next + e.blockLink.prev.blockLink.next = e.blockLink.next + e.blockLink.next.blockLink.prev = e.blockLink.prev + e.blockLink.prev = e + e.blockLink.next = e + return next +} + +func (e *entry) linkFile(s *entry) { + s.fileLink.prev = e.fileLink.prev + s.fileLink.prev.fileLink.next = s + s.fileLink.next = e + s.fileLink.next.fileLink.prev = s +} + +func (e *entry) unlinkFile() *entry { + next := e.fileLink.next + e.fileLink.prev.fileLink.next = e.fileLink.next + e.fileLink.next.fileLink.prev = e.fileLink.prev + e.fileLink.prev = e + e.fileLink.next = e + return next +} + +func (e *entry) setValue(v *Value) { + if v != nil { + v.acquire() + } + old := e.val + e.val = v + old.release() +} + +func (e *entry) peekValue() *Value { + return e.val +} + +func (e *entry) acquireValue() *Value { + v := e.val + if v != nil { + v.acquire() + } + return v +} diff --git a/pebble/internal/cache/entry_invariants.go b/pebble/internal/cache/entry_invariants.go new file mode 100644 index 0000000..31c54e4 --- /dev/null +++ b/pebble/internal/cache/entry_invariants.go @@ -0,0 +1,38 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. +// +//go:build (invariants && !race) || (tracing && !race) +// +build invariants,!race tracing,!race + +package cache + +import ( + "fmt" + "os" + + "github.com/cockroachdb/pebble/internal/invariants" +) + +// When the "invariants" or "tracing" build tags are enabled, we need to +// allocate entries using the Go allocator so entry.val properly maintains a +// reference to the Value. +const entriesGoAllocated = true + +func entryAllocNew() *entry { + e := &entry{} + // Note: this is a no-op if invariants and tracing are disabled or race is + // enabled. + invariants.SetFinalizer(e, func(obj interface{}) { + e := obj.(*entry) + if v := e.ref.refs(); v != 0 { + fmt.Fprintf(os.Stderr, "%p: cache entry has non-zero reference count: %d\n%s", + e, v, e.ref.traces()) + os.Exit(1) + } + }) + return e +} + +func entryAllocFree(e *entry) { +} diff --git a/pebble/internal/cache/entry_normal.go b/pebble/internal/cache/entry_normal.go new file mode 100644 index 0000000..92afb04 --- /dev/null +++ b/pebble/internal/cache/entry_normal.go @@ -0,0 +1,103 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. +// +//go:build (!invariants && !tracing) || race +// +build !invariants,!tracing race + +package cache + +import ( + "runtime" + "sync" + "unsafe" + + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/manual" +) + +const ( + entrySize = int(unsafe.Sizeof(entry{})) + entryAllocCacheLimit = 128 + // Avoid using runtime.SetFinalizer in race builds as finalizers tickle a bug + // in the Go race detector in go1.15 and earlier versions. This requires that + // entries are Go allocated rather than manually allocated. 
+ // + // If cgo is disabled we need to allocate the entries using the Go allocator + // and it violates the Go GC rules to put Go pointers (such as the entry + // pointer fields) into untyped memory (i.e. a []byte). + entriesGoAllocated = invariants.RaceEnabled || !cgoEnabled +) + +var entryAllocPool = sync.Pool{ + New: func() interface{} { + return newEntryAllocCache() + }, +} + +func entryAllocNew() *entry { + a := entryAllocPool.Get().(*entryAllocCache) + e := a.alloc() + entryAllocPool.Put(a) + return e +} + +func entryAllocFree(e *entry) { + a := entryAllocPool.Get().(*entryAllocCache) + a.free(e) + entryAllocPool.Put(a) +} + +type entryAllocCache struct { + entries []*entry +} + +func newEntryAllocCache() *entryAllocCache { + c := &entryAllocCache{} + if !entriesGoAllocated { + // Note the use of a "real" finalizer here (as opposed to a build tag-gated + // no-op finalizer). Without the finalizer, objects released from the pool + // and subsequently GC'd by the Go runtime would fail to have their manually + // allocated memory freed, which results in a memory leak. 
+ // lint:ignore SetFinalizer + runtime.SetFinalizer(c, freeEntryAllocCache) + } + return c +} + +func freeEntryAllocCache(obj interface{}) { + c := obj.(*entryAllocCache) + for i, e := range c.entries { + c.dealloc(e) + c.entries[i] = nil + } +} + +func (c *entryAllocCache) alloc() *entry { + n := len(c.entries) + if n == 0 { + if entriesGoAllocated { + return &entry{} + } + b := manual.New(entrySize) + return (*entry)(unsafe.Pointer(&b[0])) + } + e := c.entries[n-1] + c.entries = c.entries[:n-1] + return e +} + +func (c *entryAllocCache) dealloc(e *entry) { + if !entriesGoAllocated { + buf := (*[manual.MaxArrayLen]byte)(unsafe.Pointer(e))[:entrySize:entrySize] + manual.Free(buf) + } +} + +func (c *entryAllocCache) free(e *entry) { + if len(c.entries) == entryAllocCacheLimit { + c.dealloc(e) + return + } + c.entries = append(c.entries, e) +} diff --git a/pebble/internal/cache/refcnt_normal.go b/pebble/internal/cache/refcnt_normal.go new file mode 100644 index 0000000..9ab3348 --- /dev/null +++ b/pebble/internal/cache/refcnt_normal.go @@ -0,0 +1,59 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !tracing +// +build !tracing + +package cache + +import ( + "fmt" + "sync/atomic" + + "github.com/cockroachdb/redact" +) + +// refcnt provides an atomic reference count. This version is used when the +// "tracing" build tag is not enabled. See refcnt_tracing.go for the "tracing" +// enabled version. +type refcnt struct { + val atomic.Int32 +} + +// initialize the reference count to the specified value. 
+func (v *refcnt) init(val int32) { + v.val.Store(val) +} + +func (v *refcnt) refs() int32 { + return v.val.Load() +} + +func (v *refcnt) acquire() { + switch v := v.val.Add(1); { + case v <= 1: + panic(redact.Safe(fmt.Sprintf("pebble: inconsistent reference count: %d", v))) + } +} + +func (v *refcnt) release() bool { + switch v := v.val.Add(-1); { + case v < 0: + panic(redact.Safe(fmt.Sprintf("pebble: inconsistent reference count: %d", v))) + case v == 0: + return true + default: + return false + } +} + +func (v *refcnt) trace(msg string) { +} + +func (v *refcnt) traces() string { + return "" +} + +// Silence unused warning. +var _ = (*refcnt)(nil).traces diff --git a/pebble/internal/cache/refcnt_tracing.go b/pebble/internal/cache/refcnt_tracing.go new file mode 100644 index 0000000..1d5e6c0 --- /dev/null +++ b/pebble/internal/cache/refcnt_tracing.go @@ -0,0 +1,66 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build tracing +// +build tracing + +package cache + +import ( + "fmt" + "runtime/debug" + "strings" + "sync" + "sync/atomic" +) + +// refcnt provides an atomic reference count, along with a tracing facility for +// debugging logic errors in manipulating the reference count. This version is +// used when the "tracing" build tag is enabled. 
+type refcnt struct { + val atomic.Int32 + sync.Mutex + msgs []string +} + +func (v *refcnt) init(val int32) { + v.val.Store(val) + v.trace("init") +} + +func (v *refcnt) refs() int32 { + return v.val.Load() +} + +func (v *refcnt) acquire() { + switch n := v.val.Add(1); { + case n <= 1: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", n)) + } + v.trace("acquire") +} + +func (v *refcnt) release() bool { + n := v.val.Add(-1) + switch { + case n < 0: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", n)) + } + v.trace("release") + return n == 0 +} + +func (v *refcnt) trace(msg string) { + s := fmt.Sprintf("%s: refs=%d\n%s", msg, v.refs(), debug.Stack()) + v.Lock() + v.msgs = append(v.msgs, s) + v.Unlock() +} + +func (v *refcnt) traces() string { + v.Lock() + s := strings.Join(v.msgs, "\n") + v.Unlock() + return s +} diff --git a/pebble/internal/cache/robin_hood.go b/pebble/internal/cache/robin_hood.go new file mode 100644 index 0000000..6e093fd --- /dev/null +++ b/pebble/internal/cache/robin_hood.go @@ -0,0 +1,320 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package cache + +import ( + "fmt" + "math/bits" + "os" + "runtime/debug" + "strings" + "time" + "unsafe" + + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/manual" +) + +var hashSeed = uint64(time.Now().UnixNano()) + +// Fibonacci hash: https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ +func robinHoodHash(k key, shift uint32) uint32 { + const m = 11400714819323198485 + h := hashSeed + h ^= k.id * m + h ^= uint64(k.fileNum.FileNum()) * m + h ^= k.offset * m + return uint32(h >> shift) +} + +type robinHoodEntry struct { + key key + // Note that value may point to a Go allocated object (if the "invariants" + // build tag was specified), even though the memory for the entry itself is + // manually managed. This is technically a violation of the Cgo pointer rules: + // + // https://golang.org/cmd/cgo/#hdr-Passing_pointers + // + // Specifically, Go pointers should not be stored in C allocated memory. The + // reason for this rule is that the Go GC will not look at C allocated memory + // to find pointers to Go objects. If the only reference to a Go object is + // stored in C allocated memory, the object will be reclaimed. What makes + // this "safe" is that the Cache guarantees that there are other pointers to + // the entry and shard which will keep them alive. In particular, every Go + // allocated entry in the cache is referenced by the shard.entries map. And + // every shard is referenced by the Cache.shards map. + value *entry + // The distance the entry is from its desired position. 
+ dist uint32 +} + +type robinHoodEntries struct { + ptr unsafe.Pointer + len uint32 +} + +func newRobinHoodEntries(n uint32) robinHoodEntries { + size := uintptr(n) * unsafe.Sizeof(robinHoodEntry{}) + return robinHoodEntries{ + ptr: unsafe.Pointer(&(manual.New(int(size)))[0]), + len: n, + } +} + +func (e robinHoodEntries) at(i uint32) *robinHoodEntry { + return (*robinHoodEntry)(unsafe.Pointer(uintptr(e.ptr) + + uintptr(i)*unsafe.Sizeof(robinHoodEntry{}))) +} + +func (e robinHoodEntries) free() { + size := uintptr(e.len) * unsafe.Sizeof(robinHoodEntry{}) + buf := (*[manual.MaxArrayLen]byte)(e.ptr)[:size:size] + manual.Free(buf) +} + +// robinHoodMap is an implementation of Robin Hood hashing. Robin Hood hashing +// is an open-address hash table using linear probing. The twist is that the +// linear probe distance is reduced by moving existing entries when inserting +// and deleting. This is accomplished by keeping track of how far an entry is +// from its "desired" slot (hash of key modulo number of slots). During +// insertion, if the new entry being inserted is farther from its desired slot +// than the target entry, we swap the target and new entry. This effectively +// steals from the "rich" target entry and gives to the "poor" new entry (thus +// the origin of the name). +// +// An extension over the base Robin Hood hashing idea comes from +// https://probablydance.com/2017/02/26/i-wrote-the-fastest-hashtable/. A cap +// is placed on the max distance an entry can be from its desired slot. When +// this threshold is reached during insertion, the size of the table is doubled +// and insertion is restarted. Additionally, the entries slice is given "max +// dist" extra entries on the end. The very last entry in the entries slice is +// never used and acts as a sentinel which terminates loops. The previous +// maxDist-1 entries act as the extra entries. 
For example, if the size of the +// table is 2, maxDist is computed as 4 and the actual size of the entry slice +// is 6. +// +// +---+---+---+---+---+---+ +// | 0 | 1 | 2 | 3 | 4 | 5 | +// +---+---+---+---+---+---+ +// ^ +// size +// +// In this scenario, the target entry for a key will always be in the range +// [0,1]. Valid entries may reside in the range [0,4] due to the linear probing +// of up to maxDist entries. The entry at index 5 will never contain a value, +// and instead acts as a sentinel (its distance is always 0). The max distance +// threshold is set to log2(num-entries). This ensures that retrieval is O(log +// N), though note that N is the number of total entries, not the count of +// valid entries. +// +// Deletion is implemented via the backward shift delete mechanism instead of +// tombstones. This preserves the performance of the table in the presence of +// deletions. See +// http://codecapsule.com/2013/11/17/robin-hood-hashing-backward-shift-deletion +// for details. +type robinHoodMap struct { + entries robinHoodEntries + size uint32 + shift uint32 + count uint32 + maxDist uint32 +} + +func maxDistForSize(size uint32) uint32 { + desired := uint32(bits.Len32(size)) + if desired < 4 { + desired = 4 + } + return desired +} + +func newRobinHoodMap(initialCapacity int) *robinHoodMap { + m := &robinHoodMap{} + m.init(initialCapacity) + + // Note: this is a no-op if invariants are disabled or race is enabled. 
+ invariants.SetFinalizer(m, func(obj interface{}) { + m := obj.(*robinHoodMap) + if m.entries.ptr != nil { + fmt.Fprintf(os.Stderr, "%p: robin-hood map not freed\n", m) + os.Exit(1) + } + }) + return m +} + +func (m *robinHoodMap) init(initialCapacity int) { + if initialCapacity < 1 { + initialCapacity = 1 + } + targetSize := 1 << (uint(bits.Len(uint(2*initialCapacity-1))) - 1) + m.rehash(uint32(targetSize)) +} + +func (m *robinHoodMap) free() { + if m.entries.ptr != nil { + m.entries.free() + m.entries.ptr = nil + } +} + +func (m *robinHoodMap) rehash(size uint32) { + oldEntries := m.entries + + m.size = size + m.shift = uint32(64 - bits.Len32(m.size-1)) + m.maxDist = maxDistForSize(size) + m.entries = newRobinHoodEntries(size + m.maxDist) + m.count = 0 + + for i := uint32(0); i < oldEntries.len; i++ { + e := oldEntries.at(i) + if e.value != nil { + m.Put(e.key, e.value) + } + } + + if oldEntries.ptr != nil { + oldEntries.free() + } +} + +// Find an entry containing the specified value. This is intended to be used +// from debug and test code. +func (m *robinHoodMap) findByValue(v *entry) *robinHoodEntry { + for i := uint32(0); i < m.entries.len; i++ { + e := m.entries.at(i) + if e.value == v { + return e + } + } + return nil +} + +func (m *robinHoodMap) Count() int { + return int(m.count) +} + +func (m *robinHoodMap) Put(k key, v *entry) { + maybeExists := true + n := robinHoodEntry{key: k, value: v, dist: 0} + for i := robinHoodHash(k, m.shift); ; i++ { + e := m.entries.at(i) + if maybeExists && k == e.key { + // Entry already exists: overwrite. + e.value = n.value + m.checkEntry(i) + return + } + + if e.value == nil { + // Found an empty entry: insert here. + *e = n + m.count++ + m.checkEntry(i) + return + } + + if e.dist < n.dist { + // Swap the new entry with the current entry because the current is + // rich. We then continue to loop, looking for a new location for the + // current entry. 
Note that this is also the not-found condition for + // retrieval, which means that "k" is not present in the map. See Get(). + n, *e = *e, n + m.checkEntry(i) + maybeExists = false + } + + // The new entry gradually moves away from its ideal position. + n.dist++ + + // If we've reached the max distance threshold, grow the table and restart + // the insertion. + if n.dist == m.maxDist { + m.rehash(2 * m.size) + i = robinHoodHash(n.key, m.shift) - 1 + n.dist = 0 + maybeExists = false + } + } +} + +func (m *robinHoodMap) Get(k key) *entry { + var dist uint32 + for i := robinHoodHash(k, m.shift); ; i++ { + e := m.entries.at(i) + if k == e.key { + // Found. + return e.value + } + if e.dist < dist { + // Not found. + return nil + } + dist++ + } +} + +func (m *robinHoodMap) Delete(k key) { + var dist uint32 + for i := robinHoodHash(k, m.shift); ; i++ { + e := m.entries.at(i) + if k == e.key { + m.checkEntry(i) + // We found the entry to delete. Shift the following entries backwards + // until the next empty value or entry with a zero distance. Note that + // empty values are guaranteed to have "dist == 0". + m.count-- + for j := i + 1; ; j++ { + t := m.entries.at(j) + if t.dist == 0 { + *e = robinHoodEntry{} + return + } + e.key = t.key + e.value = t.value + e.dist = t.dist - 1 + e = t + m.checkEntry(j) + } + } + if dist > e.dist { + // Not found. 
+ return + } + dist++ + } +} + +func (m *robinHoodMap) checkEntry(i uint32) { + if invariants.Enabled { + e := m.entries.at(i) + if e.value != nil { + pos := robinHoodHash(e.key, m.shift) + if (uint32(i) - pos) != e.dist { + fmt.Fprintf(os.Stderr, "%d: invalid dist=%d, expected %d: %s\n%s", + i, e.dist, uint32(i)-pos, e.key, debug.Stack()) + os.Exit(1) + } + if e.dist > m.maxDist { + fmt.Fprintf(os.Stderr, "%d: invalid dist=%d > maxDist=%d: %s\n%s", + i, e.dist, m.maxDist, e.key, debug.Stack()) + os.Exit(1) + } + } + } +} + +func (m *robinHoodMap) String() string { + var buf strings.Builder + fmt.Fprintf(&buf, "count: %d\n", m.count) + for i := uint32(0); i < m.entries.len; i++ { + e := m.entries.at(i) + if e.value != nil { + fmt.Fprintf(&buf, "%d: [%s,%p,%d]\n", i, e.key, e.value, e.dist) + } + } + return buf.String() +} diff --git a/pebble/internal/cache/robin_hood_test.go b/pebble/internal/cache/robin_hood_test.go new file mode 100644 index 0000000..d72c1b3 --- /dev/null +++ b/pebble/internal/cache/robin_hood_test.go @@ -0,0 +1,241 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package cache + +import ( + "fmt" + "io" + "runtime" + "testing" + "time" + + "github.com/cockroachdb/pebble/internal/base" + "golang.org/x/exp/rand" +) + +func TestRobinHoodMap(t *testing.T) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + rhMap := newRobinHoodMap(0) + defer rhMap.free() + + goMap := make(map[key]*entry) + + randomKey := func() key { + n := rng.Intn(len(goMap)) + for k := range goMap { + if n == 0 { + return k + } + n-- + } + return key{} + } + + ops := 10000 + rng.Intn(10000) + for i := 0; i < ops; i++ { + var which float64 + if len(goMap) > 0 { + which = rng.Float64() + } + + switch { + case which < 0.4: + // 40% insert. 
+ var k key + k.id = rng.Uint64() + k.fileNum = base.FileNum(rng.Uint64()).DiskFileNum() + k.offset = rng.Uint64() + e := &entry{} + goMap[k] = e + rhMap.Put(k, e) + if len(goMap) != rhMap.Count() { + t.Fatalf("map sizes differ: %d != %d", len(goMap), rhMap.Count()) + } + + case which < 0.1: + // 10% overwrite. + k := randomKey() + e := &entry{} + goMap[k] = e + rhMap.Put(k, e) + if len(goMap) != rhMap.Count() { + t.Fatalf("map sizes differ: %d != %d", len(goMap), rhMap.Count()) + } + + case which < 0.75: + // 25% delete. + k := randomKey() + delete(goMap, k) + rhMap.Delete(k) + if len(goMap) != rhMap.Count() { + t.Fatalf("map sizes differ: %d != %d", len(goMap), rhMap.Count()) + } + + default: + // 25% lookup. + k := randomKey() + v := goMap[k] + u := rhMap.Get(k) + if v != u { + t.Fatalf("%s: expected %p, but found %p", k, v, u) + } + } + } + + t.Logf("map size: %d", len(goMap)) +} + +const benchSize = 1 << 20 + +func BenchmarkGoMapInsert(b *testing.B) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + keys := make([]key, benchSize) + for i := range keys { + keys[i].fileNum = base.FileNum(rng.Uint64n(1 << 20)).DiskFileNum() + keys[i].offset = uint64(rng.Intn(1 << 20)) + } + b.ResetTimer() + + var m map[key]*entry + for i, j := 0, 0; i < b.N; i, j = i+1, j+1 { + if m == nil || j == len(keys) { + b.StopTimer() + m = make(map[key]*entry, len(keys)) + j = 0 + b.StartTimer() + } + m[keys[j]] = nil + } +} + +func BenchmarkRobinHoodInsert(b *testing.B) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + keys := make([]key, benchSize) + for i := range keys { + keys[i].fileNum = base.FileNum(rng.Uint64n(1 << 20)).DiskFileNum() + keys[i].offset = uint64(rng.Intn(1 << 20)) + } + e := &entry{} + b.ResetTimer() + + var m *robinHoodMap + for i, j := 0, 0; i < b.N; i, j = i+1, j+1 { + if m == nil || j == len(keys) { + b.StopTimer() + m = newRobinHoodMap(len(keys)) + j = 0 + b.StartTimer() + } + m.Put(keys[j], e) + } + + runtime.KeepAlive(e) +} 
+ +func BenchmarkGoMapLookupHit(b *testing.B) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + keys := make([]key, benchSize) + m := make(map[key]*entry, len(keys)) + e := &entry{} + for i := range keys { + keys[i].fileNum = base.FileNum(rng.Uint64n(1 << 20)).DiskFileNum() + keys[i].offset = uint64(rng.Intn(1 << 20)) + m[keys[i]] = e + } + b.ResetTimer() + + var p *entry + for i, j := 0, 0; i < b.N; i, j = i+1, j+1 { + if j == len(keys) { + j = 0 + } + p = m[keys[j]] + } + + if testing.Verbose() { + fmt.Fprintln(io.Discard, p) + } +} + +func BenchmarkRobinHoodLookupHit(b *testing.B) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + keys := make([]key, benchSize) + m := newRobinHoodMap(len(keys)) + e := &entry{} + for i := range keys { + keys[i].fileNum = base.FileNum(rng.Uint64n(1 << 20)).DiskFileNum() + keys[i].offset = uint64(rng.Intn(1 << 20)) + m.Put(keys[i], e) + } + b.ResetTimer() + + var p *entry + for i, j := 0, 0; i < b.N; i, j = i+1, j+1 { + if j == len(keys) { + j = 0 + } + p = m.Get(keys[j]) + } + + if testing.Verbose() { + fmt.Fprintln(io.Discard, p) + } + runtime.KeepAlive(e) +} + +func BenchmarkGoMapLookupMiss(b *testing.B) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + keys := make([]key, benchSize) + m := make(map[key]*entry, len(keys)) + e := &entry{} + for i := range keys { + keys[i].id = 1 + keys[i].fileNum = base.FileNum(rng.Uint64n(1 << 20)).DiskFileNum() + keys[i].offset = uint64(rng.Intn(1 << 20)) + m[keys[i]] = e + keys[i].id = 2 + } + b.ResetTimer() + + var p *entry + for i, j := 0, 0; i < b.N; i, j = i+1, j+1 { + if j == len(keys) { + j = 0 + } + p = m[keys[j]] + } + + if testing.Verbose() { + fmt.Fprintln(io.Discard, p) + } +} + +func BenchmarkRobinHoodLookupMiss(b *testing.B) { + rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + keys := make([]key, benchSize) + m := newRobinHoodMap(len(keys)) + e := &entry{} + for i := range keys { + keys[i].id = 1 + 
keys[i].fileNum = base.FileNum(rng.Uint64n(1 << 20)).DiskFileNum() + keys[i].offset = uint64(rng.Intn(1 << 20)) + m.Put(keys[i], e) + keys[i].id = 2 + } + b.ResetTimer() + + var p *entry + for i, j := 0, 0; i < b.N; i, j = i+1, j+1 { + if j == len(keys) { + j = 0 + } + p = m.Get(keys[j]) + } + + if testing.Verbose() { + fmt.Fprintln(io.Discard, p) + } + runtime.KeepAlive(e) +} diff --git a/pebble/internal/cache/testdata/cache b/pebble/internal/cache/testdata/cache new file mode 100644 index 0000000..fc7b218 --- /dev/null +++ b/pebble/internal/cache/testdata/cache @@ -0,0 +1,99991 @@ +0 m +1 m +2 m +3 m +1 h +1 h +4 m +5 m +6 m +1 h +7 m +8 m +9 m +4 h +10 m +4 h +1 h +11 m +10 h +4 h +12 m +4 h +13 m +1 h +4 h +14 m +15 m +16 m +17 m +18 m +19 m +4 h +1 h +1 h +4 h +4 h +20 m +21 m +10 h +22 m +4 h +4 h +4 h +4 h +8 h +23 m +4 h +24 m +1 h +1 h +1 h +4 h +25 m +26 m +27 m +4 h +28 m +29 m +10 h +30 m +1 h +4 h +1 h +11 h +31 m +4 h +3 h +4 h +1 h +31 h +10 h +32 m +33 m +10 h +34 m +35 m +36 m +37 m +4 h +4 h +1 h +1 h +38 m +39 m +40 m +41 m +42 m +43 d +10 h +4 h +4 h +44 m +45 m +46 m +1 h +1 h +1 h +4 h +10 h +47 m +1 h +1 h +48 m +1 h +49 m +10 h +11 h +4 h +50 m +27 h +10 h +10 h +51 m +11 h +3 h +4 h +52 m +10 h +53 m +1 h +1 h +3 h +54 m +55 m +56 m +57 m +58 m +59 m +60 m +4 h +4 h +1 h +61 m +1 h +1 h +62 m +63 m +1 h +64 m +65 m +66 m +36 h +67 m +4 h +4 h +59 h +68 m +10 h +69 m +4 h +4 h +4 h +70 m +1 h +71 m +1 h +72 m +10 h +73 m +4 h +74 m +10 h +4 h +11 h +4 h +10 h +75 m +76 m +4 h +77 m +78 m +1 h +79 m +80 m +81 m +4 h +4 h +82 m +83 m +84 m +1 h +85 m +1 h +4 h +10 h +86 m +1 h +87 m +11 h +4 h +59 h +1 h +88 m +89 m +90 m +91 m +4 h +4 h +92 m +3 h +4 h +4 h +4 h +93 m +1 h +94 m +10 h +95 m +1 h +82 h +96 m +82 h +1 h +10 h +10 h +97 m +1 h +4 h +4 h +98 m +31 h +99 m +100 m +41 h +10 h +101 m +102 m +1 h +103 m +11 h +104 m +105 m +57 h +106 m +4 h +107 m +83 h +4 h +10 h +108 m +1 h +109 m +1 h +110 m +1 h +10 h +111 m +4 h +4 h +4 h +10 h 
+112 m +10 h +113 m +4 h +114 m +115 m +116 m +1 h +57 h +117 m +118 m +4 h +1 h +1 h +1 h +10 h +10 h +119 m +4 h +120 m +1 h +1 h +4 h +69 h +4 h +121 m +73 h +122 m +4 h +123 m +4 h +124 m +1 h +4 h +45 h +10 h +10 h +125 m +4 h +109 h +1 h +126 m +82 h +1 h +127 m +11 h +1 h +10 h +10 h +4 h +1 h +1 h +4 h +128 m +10 h +4 h +10 h +129 m +83 h +4 h +1 h +130 m +131 m +132 m +1 h +133 m +134 m +135 m +136 m +1 h +137 m +4 h +4 h +4 h +11 h +4 h +4 h +138 m +139 m +140 m +141 m +4 h +1 h +4 h +65 h +4 h +142 m +4 h +11 h +124 h +143 m +4 h +144 m +145 m +10 h +97 h +146 m +147 m +4 h +148 m +10 h +4 h +82 h +1 h +4 h +3 h +149 m +65 h +150 m +4 h +151 m +152 m +1 h +153 m +91 h +59 h +4 h +4 h +154 m +1 h +4 h +155 m +156 m +157 m +25 h +4 h +158 m +159 m +3 h +82 h +160 m +4 h +161 m +4 h +4 h +162 m +74 h +163 m +4 h +10 h +1 h +4 h +164 m +10 h +165 m +166 m +4 h +167 m +4 h +4 h +74 h +10 h +10 h +1 h +4 h +168 m +169 m +4 h +4 h +143 h +4 h +55 h +170 m +171 m +10 h +11 h +124 h +124 h +1 h +1 h +172 m +4 h +173 m +174 m +4 h +1 h +124 h +4 h +123 h +4 h +4 h +104 h +82 h +175 m +176 m +94 h +4 h +1 h +177 m +1 h +178 m +1 h +179 m +45 h +10 h +4 h +1 h +1 h +1 h +180 m +146 h +181 m +25 h +4 h +182 m +183 m +184 m +185 m +4 h +4 h +11 h +1 h +186 m +143 h +4 h +187 m +188 m +189 m +190 m +57 h +1 h +147 h +10 h +1 h +10 h +4 h +1 h +1 h +10 h +191 m +192 m +10 h +193 m +22 h +1 h +194 m +195 m +25 h +196 m +197 m +4 h +4 h +198 m +92 h +4 h +199 m +147 h +4 h +200 m +201 m +202 m +65 h +4 h +1 h +203 m +10 h +204 m +205 m +4 h +206 m +207 m +1 h +1 h +208 m +4 h +209 m +4 h +124 h +45 h +210 m +59 h +211 m +212 m +4 h +213 m +1 h +10 h +41 h +4 h +109 h +10 h +214 m +215 m +124 h +4 h +1 h +216 m +10 h +4 h +11 h +109 h +4 h +4 h +217 m +10 h +4 h +8 h +1 h +10 h +4 h +4 h +218 m +36 h +219 m +4 h +1 h +220 m +221 m +4 h +4 h +10 h +222 m +4 h +73 h +3 h +51 h +223 m +158 h +11 h +224 m +59 h +11 h +10 h +97 h +74 h +4 h +225 m +102 h +1 h +56 h +4 h +4 h +4 
h +226 m +4 h +1 h +227 m +4 h +228 m +229 m +109 h +10 h +230 m +1 h +10 h +231 m +1 h +232 m +10 h +1 h +10 h +82 h +233 m +4 h +4 h +234 m +1 h +4 h +235 m +12 m +10 h +28 m +236 m +237 m +1 h +10 h +238 m +10 h +4 h +239 m +113 h +10 h +40 m +240 m +10 h +4 h +4 h +10 h +1 h +1 h +10 h +82 h +4 h +241 m +4 h +4 h +242 m +243 m +110 h +31 h +244 m +245 m +246 m +4 h +1 h +247 m +248 m +4 h +1 h +1 h +249 m +4 h +124 h +119 h +4 h +4 h +10 h +4 h +10 h +123 h +250 m +251 m +10 h +4 h +252 m +253 m +254 m +238 h +8 h +1 h +1 h +4 h +57 h +4 h +255 m +4 h +10 h +4 h +41 h +10 h +256 m +257 m +92 h +129 h +258 m +125 h +57 h +10 h +97 h +4 h +1 h +1 h +31 h +259 m +1 h +260 m +4 h +4 h +1 h +261 m +196 h +262 m +1 h +1 h +263 m +1 h +3 h +1 h +264 m +265 m +1 h +45 h +82 h +10 h +266 m +267 m +268 m +4 h +262 h +10 h +269 m +4 h +270 m +271 m +272 m +11 h +140 h +10 h +4 h +10 h +273 m +82 h +25 h +4 h +274 m +1 h +10 h +4 h +275 m +1 h +10 h +276 m +4 h +1 h +277 m +10 h +10 h +4 h +4 h +108 m +4 h +278 m +4 h +279 m +4 h +10 h +1 h +57 h +1 h +1 h +57 h +3 h +1 h +10 h +4 h +1 h +280 m +11 h +4 h +1 h +281 m +282 m +10 h +283 m +284 m +285 m +143 h +4 h +1 h +124 h +57 h +12 h +266 h +4 h +4 h +104 h +4 h +11 h +4 h +4 h +286 m +109 h +4 h +4 h +10 h +4 h +287 m +1 h +288 m +79 m +125 h +94 h +139 h +289 m +290 m +4 h +1 h +1 h +1 h +291 m +292 m +293 m +4 h +4 h +294 m +295 m +1 h +10 h +11 h +10 h +92 h +112 m +11 h +10 h +167 h +1 h +296 m +4 h +297 m +73 h +298 m +258 h +65 h +4 h +1 h +4 h +299 m +4 h +1 h +1 h +74 h +300 m +1 h +1 h +10 h +82 h +301 m +83 h +302 m +139 h +4 h +4 h +278 h +303 m +304 m +147 h +305 m +4 h +306 m +1 h +135 m +1 h +4 h +1 h +4 h +307 m +4 h +57 h +10 h +1 h +11 h +25 h +125 h +57 h +4 h +1 h +4 h +4 h +308 m +1 h +10 h +309 m +1 h +310 m +83 h +119 h +311 m +312 m +313 m +4 h +119 h +314 m +1 h +4 h +315 m +316 m +317 m +10 h +10 h +1 h +4 h +59 h +318 m +1 h +319 m +1 h +4 h +10 h +4 h +320 m +109 h +321 m +10 h +322 m +323 m 
+108 h +10 h +324 m +10 h +4 h +135 h +1 h +325 m +4 h +10 h +272 h +1 h +10 h +4 h +1 h +64 m +326 m +119 h +327 m +328 m +329 m +60 m +36 h +330 m +119 h +4 h +331 m +332 m +4 h +10 h +4 h +333 m +10 h +10 h +4 h +1 h +1 h +10 h +1 h +1 h +334 m +335 m +1 h +65 h +167 h +336 m +143 h +266 h +114 m +337 m +10 h +3 h +124 h +338 m +4 h +339 m +10 h +10 h +340 m +82 h +10 h +11 h +83 h +10 h +4 h +4 h +4 h +295 h +10 h +4 h +1 h +1 h +341 m +1 h +342 m +343 m +57 h +1 h +79 h +110 h +1 h +1 h +344 m +82 h +181 m +11 h +345 m +10 h +1 h +1 h +1 h +346 m +1 h +347 m +1 h +13 m +4 h +348 m +349 m +10 h +13 h +4 h +4 h +1 h +4 h +11 h +350 m +4 h +351 m +10 h +4 h +352 m +353 m +104 h +4 h +82 h +354 m +1 h +4 h +124 h +355 m +11 h +295 h +250 h +12 h +10 h +356 m +109 h +10 h +357 m +1 h +10 h +73 h +25 h +358 m +1 h +1 h +57 h +4 h +359 m +11 h +360 m +1 h +10 h +31 h +361 m +59 h +1 h +4 h +4 h +1 h +362 m +1 h +12 h +41 h +363 m +1 h +10 h +11 h +4 h +147 h +114 h +10 h +10 h +5 m +10 h +10 h +364 m +365 m +1 h +10 h +11 h +4 h +366 m +1 h +4 h +4 h +1 h +367 m +368 m +12 h +147 h +129 h +65 h +10 h +369 m +10 h +82 h +4 h +4 h +370 m +25 h +1 h +371 m +10 h +119 h +4 h +372 m +94 h +373 m +4 h +1 h +4 h +374 m +12 h +4 h +10 h +375 m +28 h +61 m +4 h +114 h +147 h +195 m +376 m +1 h +377 m +378 m +170 m +4 h +10 h +196 h +4 h +1 h +4 h +379 m +380 m +4 h +381 m +4 h +144 m +41 h +1 h +382 m +383 m +384 m +1 h +190 m +112 h +10 h +10 h +10 h +385 m +1 h +1 h +4 h +10 h +1 h +4 h +386 m +4 h +4 h +31 h +10 h +4 h +10 h +74 h +387 m +1 h +388 m +31 h +10 h +389 m +10 h +3 h +10 h +383 h +10 h +1 h +110 h +390 m +10 h +391 m +392 m +25 h +4 h +238 h +10 h +393 m +394 m +4 h +10 h +22 m +4 h +282 h +1 h +4 h +1 h +395 m +396 m +230 m +1 h +1 h +4 h +1 h +10 h +4 h +4 h +195 h +92 h +307 h +10 h +397 m +4 h +4 h +1 h +1 h +4 h +229 m +398 m +4 h +1 h +4 h +4 h +399 m +45 h +79 h +4 h +1 h +27 m +140 h +1 h +400 m +1 h +10 h +1 h +48 m +401 m +402 m +403 m +404 m +4 h +10 
h +405 m +1 h +406 m +1 h +1 h +407 m +408 m +10 h +4 h +270 m +195 h +4 h +1 h +409 m +4 h +135 h +1 h +4 h +3 h +410 m +4 h +411 m +4 h +10 h +412 m +10 h +413 m +10 h +138 m +104 h +4 h +172 m +1 h +4 h +1 h +414 m +4 h +4 h +4 h +11 h +195 h +1 h +83 h +109 h +1 h +10 h +10 h +10 h +4 h +415 m +4 h +10 h +416 m +1 h +1 h +4 h +417 m +10 h +10 h +4 h +4 h +368 h +74 h +10 h +65 h +295 h +383 h +4 h +4 h +82 h +10 h +25 h +11 h +64 h +143 h +418 m +10 h +92 h +419 m +420 m +421 m +1 h +59 h +57 h +4 h +422 m +10 h +258 h +423 m +424 m +1 h +4 h +1 h +11 h +425 m +426 m +10 h +4 h +4 h +4 h +427 m +1 h +56 h +1 h +428 m +279 m +429 m +11 h +1 h +430 m +83 h +124 h +4 h +4 h +1 h +1 h +11 h +1 h +31 h +10 h +1 h +266 h +4 h +431 m +432 m +10 h +1 h +1 h +433 m +1 h +11 h +1 h +4 h +1 h +434 m +143 h +4 h +112 h +435 m +436 m +437 m +1 h +10 h +4 h +1 h +438 m +439 m +10 h +110 h +1 h +440 m +10 h +12 h +1 h +441 m +4 h +442 m +1 h +1 h +147 h +4 h +4 h +10 h +1 h +4 h +443 m +10 h +444 m +4 h +445 m +4 h +10 h +1 h +4 h +79 h +74 h +1 h +31 h +1 h +146 h +446 m +10 h +10 h +1 h +164 m +31 h +4 h +83 h +4 h +82 h +4 h +4 h +4 h +10 h +1 h +447 m +10 h +11 h +10 h +104 h +448 m +449 m +450 m +1 h +451 m +1 h +41 h +36 h +452 m +82 h +453 m +10 h +1 h +371 h +4 h +454 m +455 m +31 h +1 h +10 h +456 m +10 h +358 h +457 m +74 h +458 m +10 h +459 m +4 h +65 h +12 h +10 h +4 h +460 m +4 h +11 h +156 m +125 h +118 m +1 h +10 h +10 h +461 m +4 h +114 h +11 h +4 h +462 m +31 h +124 h +463 m +464 m +1 h +1 h +109 h +135 h +10 h +1 h +1 h +465 m +1 h +74 h +466 m +4 h +467 m +4 h +4 h +10 h +468 m +11 h +4 h +1 h +10 h +1 h +469 m +1 h +4 h +4 h +1 h +10 h +195 h +10 h +470 m +471 m +4 h +472 m +4 h +125 h +4 h +146 h +172 h +473 m +4 h +10 h +59 h +4 h +31 h +4 h +474 m +4 h +475 m +4 h +266 h +8 h +4 h +27 h +57 h +476 m +477 m +478 m +10 h +4 h +1 h +10 h +479 m +371 h +1 h +4 h +59 h +94 h +4 h +1 h +480 m +481 m +4 h +482 m +483 m +1 h +484 m +1 h +169 m +1 h +4 h +4 h +4 
h +266 h +4 h +4 h +4 h +485 m +41 h +124 h +1 h +1 h +25 h +486 m +487 m +25 h +488 m +1 h +1 h +489 m +4 h +1 h +25 h +4 h +10 h +490 m +4 h +4 h +224 m +97 h +491 m +10 h +22 h +492 m +10 h +493 m +1 h +494 m +1 h +10 h +170 h +495 m +1 h +83 h +496 m +497 m +498 m +4 h +11 h +114 h +1 h +499 m +500 m +4 h +196 h +82 h +1 h +1 h +57 h +10 h +501 m +1 h +10 h +1 h +1 h +27 h +83 h +502 m +4 h +503 m +4 h +4 h +332 m +1 h +4 h +4 h +238 h +504 m +505 m +195 h +83 h +10 h +156 h +4 h +506 m +4 h +507 m +4 h +1 h +4 h +1 h +307 h +59 h +508 m +1 h +97 h +4 h +509 m +10 h +510 m +10 h +1 h +4 h +125 h +185 m +4 h +511 m +4 h +4 h +10 h +4 h +347 m +92 h +190 m +169 h +196 h +110 h +1 h +4 h +10 h +10 h +1 h +11 h +1 h +512 m +4 h +513 m +514 m +10 h +515 m +516 m +1 h +517 m +57 h +138 h +11 h +330 m +1 h +4 h +518 m +4 h +10 h +82 h +31 h +1 h +10 h +1 h +519 m +1 h +4 h +1 h +4 h +1 h +1 h +109 h +10 h +520 m +55 m +521 m +522 m +523 m +297 m +108 h +10 h +10 h +31 h +164 h +524 m +97 h +525 m +526 m +41 h +4 h +4 h +1 h +124 h +4 h +1 h +527 m +195 h +528 m +529 m +4 h +109 h +241 m +4 h +4 h +4 h +1 h +530 m +146 h +10 h +135 h +11 h +4 h +358 h +169 h +531 m +1 h +532 m +4 h +533 m +4 h +4 h +94 h +10 h +4 h +4 h +55 h +1 h +266 h +1 h +10 h +1 h +4 h +82 h +4 h +534 m +535 m +10 h +10 h +4 h +181 m +10 h +536 m +3 h +10 h +1 h +1 h +1 h +10 h +4 h +1 h +265 m +537 m +10 h +82 h +56 h +538 m +4 h +539 m +1 h +92 h +56 h +540 m +541 m +1 h +10 h +1 h +542 m +4 h +64 h +543 m +82 h +544 m +4 h +258 h +4 h +545 m +1 h +170 h +10 h +4 h +297 h +368 h +1 h +1 h +10 h +190 h +135 h +1 h +1 h +546 m +4 h +4 h +307 h +4 h +1 h +169 h +4 h +4 h +143 h +10 h +1 h +547 m +4 h +4 h +10 h +10 h +97 h +548 m +1 h +549 m +550 m +1 h +1 h +551 m +4 h +8 h +4 h +4 h +1 h +552 m +4 h +553 m +538 h +1 h +554 m +1 h +555 m +10 h +4 h +10 h +556 m +10 h +10 h +557 m +558 m +4 h +125 h +184 m +559 m +560 m +4 h +10 h +92 h +10 h +561 m +562 m +57 h +3 h +4 h +4 h +563 m +190 h +4 h 
+10 h +4 h +564 m +4 h +4 h +59 h +1 h +565 m +10 h +4 h +1 h +566 m +41 h +1 h +1 h +143 h +567 m +10 h +10 h +568 m +82 h +1 h +10 h +4 h +569 m +353 m +570 m +110 h +571 m +572 m +31 h +82 h +573 m +574 m +55 h +1 h +1 h +4 h +4 h +386 m +575 m +576 m +93 m +4 h +577 m +4 h +578 m +579 m +580 m +4 h +4 h +581 m +4 h +582 m +4 h +10 h +4 h +583 m +4 h +1 h +4 h +4 h +584 m +143 h +45 h +1 h +4 h +585 m +59 h +4 h +11 h +586 m +587 m +4 h +588 m +1 h +10 h +10 h +1 h +589 m +1 h +590 m +11 h +1 h +4 h +591 m +4 h +592 m +4 h +4 h +593 m +10 h +594 m +595 m +4 h +241 h +596 m +1 h +443 m +1 h +1 h +597 m +598 m +4 h +10 h +10 h +36 h +599 m +1 h +13 h +4 h +4 h +229 m +1 h +1 h +1 h +4 h +10 h +1 h +10 h +600 m +601 m +4 h +104 h +11 h +4 h +11 h +1 h +10 h +602 m +1 h +83 h +386 h +4 h +83 h +10 h +158 h +603 m +604 m +605 m +83 h +4 h +10 h +83 h +606 m +607 m +10 h +109 h +4 h +25 h +608 m +609 m +4 h +10 h +10 h +610 m +31 h +611 m +10 h +10 h +4 h +612 m +330 h +613 m +10 h +10 h +79 h +614 m +518 h +92 h +4 h +615 m +147 h +4 h +10 h +616 m +4 h +4 h +113 m +617 m +4 h +4 h +108 h +618 m +11 h +1 h +1 h +619 m +620 m +307 h +4 h +4 h +11 h +109 h +10 h +621 m +622 m +386 h +4 h +1 h +274 m +10 h +10 h +623 m +97 h +10 h +10 h +3 h +4 h +4 h +4 h +1 h +443 h +624 m +1 h +1 h +10 h +170 h +3 h +625 m +626 m +4 h +4 h +11 h +4 h +59 h +31 h +627 m +143 h +628 m +13 h +629 m +10 h +1 h +4 h +1 h +4 h +630 m +56 h +10 h +631 m +129 h +22 h +27 h +1 h +4 h +632 m +1 h +25 h +1 h +4 h +82 h +633 m +4 h +634 m +1 h +635 m +636 m +274 h +114 h +1 h +637 m +1 h +638 m +639 m +97 h +10 h +10 h +332 h +4 h +12 h +640 m +368 h +450 m +641 m +11 h +4 h +92 h +4 h +4 h +642 m +4 h +459 m +10 h +1 h +643 m +1 h +1 h +10 h +270 m +3 h +644 m +536 m +10 h +4 h +645 m +646 m +647 m +1 h +114 h +5 m +1 h +10 h +65 h +224 h +170 h +648 m +82 h +10 h +4 h +649 m +1 h +10 h +1 h +650 m +11 h +651 m +1 h +652 m +653 m +4 h +654 m +97 h +109 h +83 h +10 h +56 h +146 h +4 h +10 h +65 
h +4 h +4 h +655 m +656 m +4 h +1 h +657 m +11 h +11 h +1 h +270 h +25 h +10 h +1 h +147 h +658 m +64 h +1 h +1 h +1 h +59 h +1 h +659 m +660 m +10 h +27 h +661 m +4 h +662 m +1 h +1 h +4 h +104 h +663 m +10 h +4 h +664 m +10 h +665 m +666 m +11 h +278 h +10 h +181 h +10 h +667 m +4 h +668 m +1 h +669 m +330 h +670 m +671 m +1 h +64 h +4 h +11 h +11 h +672 m +4 h +4 h +673 m +10 h +4 h +674 m +675 m +4 h +676 m +677 m +4 h +10 h +4 h +97 h +4 h +1 h +1 h +1 h +1 h +10 h +678 m +4 h +10 h +114 h +679 m +4 h +59 h +59 h +4 h +4 h +1 h +4 h +4 h +680 m +4 h +65 h +45 h +4 h +41 h +73 h +4 h +31 h +1 h +681 m +10 h +109 h +146 h +22 h +682 m +683 m +4 h +684 m +10 h +10 h +685 m +10 h +10 h +4 h +4 h +4 h +686 m +1 h +4 h +687 m +1 h +688 m +1 h +1 h +4 h +1 h +10 h +434 m +689 m +690 m +41 h +4 h +691 m +4 h +13 h +692 m +4 h +1 h +693 m +10 h +10 h +694 m +1 h +10 h +695 m +59 h +41 h +1 h +4 h +696 m +1 h +10 h +113 h +697 m +698 m +23 m +11 h +699 m +10 h +700 m +94 h +701 m +640 h +702 m +250 h +10 h +1 h +703 m +4 h +1 h +1 h +4 h +10 h +10 h +704 m +265 h +4 h +10 h +74 h +147 h +705 m +4 h +4 h +706 m +59 h +707 m +4 h +569 m +4 h +135 h +4 h +708 m +4 h +1 h +709 m +31 h +1 h +143 h +4 h +710 m +711 m +11 h +57 h +1 h +4 h +1 h +110 h +10 h +712 m +4 h +713 m +12 h +1 h +714 m +541 m +10 h +1 h +97 h +10 h +1 h +359 m +1 h +715 m +716 m +1 h +10 h +717 m +10 h +4 h +57 h +1 h +10 h +1 h +4 h +4 h +4 h +1 h +1 h +718 m +1 h +10 h +295 h +719 m +720 m +4 h +119 m +4 h +11 h +266 h +721 m +4 h +36 h +722 m +1 h +4 h +4 h +723 m +4 h +724 m +353 h +1 h +10 h +1 h +195 h +10 h +1 h +250 h +725 m +726 m +31 h +727 m +4 h +196 h +1 h +36 h +4 h +493 m +575 m +4 h +728 m +1 h +146 h +729 m +1 h +82 h +1 h +4 h +730 m +59 h +731 m +10 h +119 h +4 h +1 h +732 m +10 h +1 h +10 h +1 h +733 m +82 h +4 h +11 h +1 h +4 h +4 h +4 h +4 h +734 m +10 h +36 h +10 h +4 h +27 h +1 h +735 m +736 m +79 h +45 h +737 m +10 h +4 h +10 h +1 h +4 h +10 h +10 h +4 h +1 h +1 h +25 h +738 m 
+1 h +10 h +739 m +27 h +167 h +4 h +740 m +10 h +10 h +692 h +1 h +57 h +741 m +4 h +156 h +4 h +10 h +1 h +4 h +4 h +464 m +1 h +59 h +4 h +742 m +1 h +1 h +743 m +744 m +169 h +25 h +4 h +1 h +10 h +295 h +745 m +250 h +12 h +9 m +746 m +747 m +124 h +748 m +4 h +749 m +307 h +92 h +4 h +10 h +4 h +10 h +1 h +1 h +750 m +174 m +1 h +278 h +1 h +8 h +258 h +751 m +4 h +4 h +1 h +752 m +65 h +10 h +1 h +753 m +258 h +4 h +59 h +164 h +4 h +754 m +755 m +82 h +1 h +10 h +4 h +10 h +1 h +756 m +57 h +1 h +48 m +757 m +1 h +276 m +82 h +1 h +758 m +1 h +358 h +4 h +759 m +760 m +4 h +4 h +761 m +238 h +717 h +762 m +10 h +4 h +241 h +10 h +1 h +4 h +10 h +10 h +4 h +446 m +763 m +1 h +764 m +83 h +4 h +765 m +766 m +767 m +4 h +768 m +158 h +238 h +1 h +1 h +83 h +82 h +82 h +4 h +109 h +4 h +31 h +1 h +4 h +109 h +769 m +770 m +112 h +229 h +1 h +31 h +771 m +4 h +11 h +204 m +1 h +4 h +4 h +204 h +772 m +272 h +4 h +4 h +1 h +83 h +536 h +773 m +4 h +774 m +4 h +3 h +775 m +776 m +718 h +57 h +1 h +46 m +777 m +1 h +778 m +82 h +82 h +4 h +1 h +779 m +124 h +97 h +266 h +780 m +781 m +4 h +10 h +4 h +782 m +11 h +783 m +4 h +1 h +784 m +10 h +10 h +785 m +4 h +31 h +786 m +787 m +4 h +1 h +10 h +4 h +788 m +4 h +789 m +790 m +143 h +10 h +3 h +110 h +791 m +1 h +10 h +27 h +792 m +4 h +1 h +4 h +1 h +4 h +793 m +143 h +4 h +4 h +4 h +794 m +795 m +13 h +4 h +125 h +4 h +4 h +796 m +4 h +94 h +195 h +4 h +36 h +1 h +4 h +4 h +59 h +4 h +174 h +797 m +289 m +82 h +4 h +798 m +123 h +1 h +10 h +4 h +799 m +1 h +800 m +4 h +801 m +146 h +55 h +802 m +3 h +10 h +83 h +82 h +803 m +4 h +1 h +1 h +1 h +4 h +804 m +82 h +57 h +278 h +805 m +94 h +1 h +4 h +25 h +806 m +57 h +1 h +4 h +4 h +807 m +1 h +443 h +808 m +1 h +1 h +809 m +13 h +1 h +64 h +4 h +810 m +1 h +1 h +811 m +57 h +812 m +4 h +41 h +10 h +813 m +814 m +11 h +4 h +45 h +4 h +25 h +204 h +4 h +1 h +41 h +28 h +815 m +4 h +4 h +10 h +816 m +817 m +1 h +4 h +172 h +4 h +208 m +818 m +819 m +435 m +820 m +4 h 
+821 m +124 h +4 h +4 h +186 m +4 h +1 h +536 h +4 h +123 h +822 m +123 h +10 h +1 h +10 h +1 h +4 h +10 h +1 h +823 m +158 h +824 m +1 h +825 m +4 h +826 m +4 h +4 h +1 h +4 h +4 h +827 m +828 m +4 h +109 h +4 h +4 h +10 h +64 h +4 h +829 m +1 h +1 h +10 h +4 h +830 m +57 h +57 h +4 h +1 h +831 m +832 m +1 h +833 m +10 h +1 h +834 m +10 h +11 h +835 m +4 h +10 h +4 h +299 m +299 h +55 h +1 h +118 h +74 h +4 h +104 h +10 h +56 h +10 h +10 h +836 m +4 h +124 h +1 h +119 h +1 h +1 h +94 h +10 h +44 m +837 m +82 h +1 h +838 m +109 h +10 h +83 h +1 h +839 m +83 h +840 m +841 m +11 h +1 h +11 h +10 h +10 h +718 h +4 h +842 m +843 m +1 h +4 h +281 m +77 m +45 h +10 h +332 h +844 m +97 h +4 h +25 h +845 m +846 m +10 h +847 m +265 h +848 m +10 h +10 h +4 h +849 m +1 h +850 m +4 h +157 m +1 h +851 m +852 m +4 h +4 h +853 m +854 m +855 m +4 h +1 h +856 m +10 h +4 h +857 m +358 h +4 h +59 h +858 m +4 h +3 h +10 h +4 h +1 h +4 h +10 h +172 h +12 h +114 h +4 h +146 h +57 h +859 m +82 h +4 h +860 m +1 h +861 m +1 h +10 h +4 h +862 m +146 h +4 h +4 h +1 h +533 m +10 h +863 m +1 h +11 h +1 h +12 h +1 h +83 h +864 m +4 h +10 h +4 h +4 h +1 h +865 m +10 h +866 m +3 h +867 m +289 h +4 h +10 h +10 h +868 m +1 h +1 h +10 h +13 h +1 h +4 h +1 h +170 h +146 h +4 h +4 h +94 h +1 h +869 m +1 h +870 m +1 h +1 h +871 m +1 h +4 h +92 h +1 h +4 h +872 m +873 m +874 m +8 h +358 h +28 h +875 m +4 h +119 h +4 h +876 m +83 h +1 h +57 h +279 m +4 h +1 h +1 h +400 m +4 h +536 h +11 h +172 h +877 m +4 h +4 h +10 h +10 h +1 h +4 h +1 h +69 m +1 h +1 h +878 m +879 m +1 h +10 h +4 h +1 h +880 m +59 h +881 m +10 h +25 h +882 m +4 h +1 h +10 h +4 h +1 h +10 h +883 m +884 m +276 m +4 h +125 h +4 h +885 m +124 h +4 h +125 h +1 h +59 h +1 h +1 h +4 h +4 h +886 m +1 h +887 m +196 h +888 m +79 h +27 h +11 h +889 m +4 h +10 h +4 h +4 h +41 h +890 m +4 h +4 h +10 h +891 m +892 m +90 m +893 m +894 m +91 m +10 h +1 h +59 h +1 h +4 h +1 h +1 h +1 h +895 m +56 h +4 h +1 h +896 m +897 m +898 m +4 h +4 h +11 h +82 h 
+1 h +899 m +258 h +4 h +900 m +901 m +902 m +109 h +1 h +4 h +903 m +135 h +57 h +94 h +1 h +158 h +297 h +56 h +904 m +905 m +4 h +906 m +687 m +1 h +25 h +196 h +1 h +10 h +907 m +1 h +10 h +4 h +1 h +170 h +11 h +1 h +110 h +4 h +10 h +10 h +4 h +908 m +1 h +61 m +10 h +1 h +4 h +909 m +1 h +185 h +910 m +10 h +911 m +105 m +912 m +10 h +10 h +83 h +10 h +4 h +59 h +913 m +10 h +192 m +1 h +1 h +4 h +4 h +4 h +124 h +914 m +4 h +915 m +10 h +916 m +10 h +83 h +1 h +4 h +1 h +307 h +172 h +917 m +143 h +1 h +4 h +918 m +196 h +1 h +4 h +874 h +82 h +919 m +920 g +1 h +1 h +1 h +4 h +55 h +1 h +921 m +922 m +170 h +1 h +923 m +10 h +10 h +4 h +1 h +924 m +192 h +857 h +11 h +10 h +4 h +925 m +926 m +927 m +1 h +11 h +10 h +928 m +4 h +1 h +4 h +104 h +929 m +1 h +10 h +82 h +930 m +196 h +4 h +931 m +3 h +932 m +933 m +10 h +1 h +934 m +10 h +164 h +266 h +4 h +935 m +10 h +4 h +1 h +447 m +936 m +937 m +4 h +938 m +4 h +4 h +4 h +4 h +4 h +939 m +124 h +83 h +940 m +1 h +36 h +941 m +383 h +10 h +942 m +83 h +4 h +11 h +1 h +4 h +4 h +4 h +1 h +1 h +943 m +4 h +4 h +944 m +10 h +1 h +3 h +945 m +10 h +10 h +10 h +1 h +946 m +1 h +1 h +55 h +4 h +947 m +1 h +948 m +4 h +124 h +190 h +949 m +950 m +10 h +951 m +4 h +952 m +278 h +31 h +1 h +953 m +1 h +299 h +4 h +954 m +1 h +69 h +955 m +10 h +1 h +10 h +4 h +156 h +4 h +10 h +956 m +307 h +4 h +4 h +55 h +1 h +1 h +957 m +74 h +4 h +229 h +174 h +195 h +10 h +939 h +4 h +4 h +4 h +195 h +10 h +1 h +1 h +57 h +190 h +4 h +1 h +10 h +48 m +104 h +1 h +1 h +435 m +1 h +1 h +1 h +4 h +4 h +11 h +10 h +31 h +10 h +83 h +4 h +250 h +4 h +4 h +10 h +11 h +1 h +958 m +4 h +83 h +25 h +4 h +959 m +10 h +4 h +158 h +1 h +1 h +10 h +4 h +960 m +25 h +4 h +961 m +10 h +119 h +10 h +4 h +1 h +962 m +146 h +104 h +1 h +10 h +156 h +57 h +4 h +1 h +963 m +1 h +1 h +10 h +964 m +125 h +4 h +1 h +10 h +965 m +966 m +45 h +967 m +10 h +4 h +55 h +1 h +4 h +4 h +8 h +10 h +27 h +59 h +1 h +10 h +1 h +4 h +129 h +10 h +164 h +4 h 
+4 h +4 h +1 h +10 h +4 h +968 m +10 h +82 h +1 h +1 h +969 m +10 h +4 h +970 m +971 m +972 m +10 h +1 h +4 h +1 h +238 h +203 m +77 m +1 h +45 h +973 m +4 h +13 h +4 h +55 h +45 h +974 m +1 h +757 m +3 h +4 h +4 h +975 m +104 h +976 m +330 h +109 h +1 h +4 h +10 h +536 h +763 m +57 h +4 h +4 h +977 m +125 h +10 h +978 m +4 h +1 h +196 h +358 h +4 h +979 m +4 h +11 h +4 h +980 m +10 h +25 h +10 h +981 m +1 h +57 h +1 h +73 h +10 h +92 h +1 h +41 h +4 h +1 h +4 h +1 h +10 h +11 h +1 h +982 m +10 h +983 m +687 h +12 h +11 h +1 h +4 h +196 h +1 h +41 h +984 m +4 h +4 h +985 m +386 h +10 h +1 h +986 m +59 h +987 m +1 h +988 m +10 h +1 h +11 h +10 h +238 h +146 h +4 h +757 h +10 h +989 m +990 m +991 m +383 h +992 m +1 h +993 m +59 h +4 h +4 h +258 h +97 h +4 h +1 h +994 m +1 h +995 m +1 h +1 h +4 h +10 h +996 m +4 h +1 h +124 h +12 h +4 h +997 m +998 m +4 h +1 h +4 h +10 h +1 h +4 h +10 h +36 h +999 m +4 h +443 h +11 h +359 m +1 h +10 h +4 h +1000 m +4 h +10 h +1001 m +438 m +1 h +1002 m +1003 m +1004 m +109 h +1 h +1005 m +4 h +196 h +1006 m +97 h +1 h +4 h +10 h +10 h +1007 m +1008 m +83 h +104 h +1009 m +1 h +4 h +56 h +1010 m +477 m +1011 m +3 h +1012 m +56 h +10 h +12 h +4 h +3 h +25 h +1013 m +10 h +1 h +1 h +10 h +1014 m +1015 m +10 h +1016 m +279 m +10 h +4 h +4 h +4 h +1017 m +10 h +4 h +114 h +1 h +4 h +1 h +4 h +172 h +1 h +125 h +1 h +10 h +1018 m +4 h +258 h +4 h +10 h +10 h +74 h +1 h +25 h +10 h +124 h +581 m +195 h +1019 m +57 h +1020 m +319 m +109 h +1021 m +4 h +4 h +181 h +65 h +10 h +1022 m +92 h +1 h +79 h +109 h +278 h +10 h +27 h +4 h +40 h +1 h +1023 m +4 h +10 h +1024 m +250 h +74 h +1 h +250 h +1025 m +11 h +59 h +10 h +10 h +1026 m +10 h +4 h +143 h +267 m +4 h +1 h +478 m +25 h +1027 m +1028 m +1 h +1029 m +4 h +4 h +1030 m +1 h +4 h +1 h +83 h +4 h +10 h +955 m +10 h +1031 m +1 h +4 h +41 h +164 h +4 h +1032 m +4 h +1 h +1033 m +1034 m +1035 m +1036 m +1037 m +83 h +119 h +1 h +272 h +10 h +1038 m +4 h +11 h +1039 m +4 h +79 h +4 h +1040 m 
+28 h +10 h +11 h +1041 m +59 h +41 h +4 h +13 h +4 h +1042 m +1043 m +10 h +1 h +4 h +1 h +1044 m +1 h +1045 m +10 h +4 h +10 h +1046 m +1 h +1047 m +4 h +1048 m +83 h +1049 m +1 h +10 h +3 h +4 h +112 h +36 h +1 h +1 h +4 h +1003 h +1050 m +114 h +4 h +1051 m +1052 m +1 h +1053 m +3 h +11 h +1 h +1 h +4 h +1 h +1054 m +1055 m +4 h +10 h +1056 m +4 h +1027 h +150 m +4 h +73 h +1057 m +109 h +83 h +779 m +1 h +195 h +10 h +25 h +4 h +4 h +4 h +10 h +4 h +4 h +1 h +4 h +41 h +1 h +112 h +1 h +4 h +1 h +10 h +1058 m +1 h +1 h +4 h +1059 m +1060 m +1 h +4 h +10 h +278 h +59 h +41 h +10 h +1 h +10 h +97 h +10 h +4 h +1061 m +1062 m +10 h +1063 m +11 h +1064 m +10 h +4 h +10 h +64 h +10 h +279 h +10 h +1 h +250 h +45 h +10 h +10 h +11 h +1 h +1065 m +10 h +3 h +10 h +170 h +41 h +27 h +4 h +1066 m +1067 m +11 h +57 h +1 h +1 h +10 h +1068 m +41 h +92 h +1069 m +262 h +4 h +1 h +1 h +4 h +1070 m +386 h +4 h +1071 m +1 h +1 h +12 h +1 h +1072 m +1 h +114 h +4 h +10 h +4 h +57 h +403 m +1073 m +330 h +1074 m +4 h +1 h +109 h +10 h +4 h +10 h +1075 m +1 h +1 h +1076 m +94 h +10 h +1 h +11 h +45 h +10 h +4 h +10 h +1 h +4 h +4 h +1077 m +10 h +1 h +124 h +10 h +10 h +4 h +1 h +41 h +1078 m +146 h +4 h +1 h +1079 m +1080 m +1081 m +1 h +92 h +1 h +4 h +10 h +25 h +4 h +4 h +77 h +11 h +4 h +4 h +1082 m +1083 m +1 h +1 h +1084 m +172 h +10 h +885 m +4 h +1 h +3 h +4 h +1 h +10 h +104 h +124 h +11 h +1 h +10 h +4 h +4 h +10 h +1 h +1 h +1085 m +105 m +265 h +8 h +1086 m +1087 m +4 h +4 h +4 h +109 h +27 h +4 h +1 h +4 h +4 h +1088 m +10 h +4 h +10 h +1 h +57 h +1089 m +367 m +125 h +4 h +1 h +4 h +1090 m +1091 m +146 h +1092 m +1093 m +1094 m +1095 m +195 h +57 h +4 h +1096 m +4 h +4 h +4 h +10 h +1097 m +74 h +1098 m +97 h +1 h +4 h +169 h +13 h +1099 m +4 h +1 h +10 h +4 h +4 h +10 h +4 h +4 h +1100 m +1 h +10 h +173 m +4 h +1101 m +1102 m +4 h +1 h +1 h +1103 m +11 h +1104 m +4 h +1105 m +1 h +4 h +112 h +4 h +1 h +1 h +1106 m +59 h +1 h +82 h +10 h +1107 m +4 h +1108 m +55 
h +10 h +124 h +79 h +1109 m +358 h +258 h +1 h +1110 m +146 h +10 h +433 m +1111 m +56 h +1 h +4 h +4 h +1 h +1 h +4 h +4 h +1112 m +4 h +1 h +83 h +57 h +1113 m +10 h +4 h +13 h +434 h +1114 m +1 h +238 h +1115 m +1116 m +4 h +109 h +1117 m +10 h +1118 m +1 h +97 h +1 h +1119 m +1120 m +10 h +1121 m +4 h +4 h +4 h +4 h +4 h +36 h +1122 m +104 h +59 h +1123 m +10 h +4 h +129 h +1124 m +10 h +135 h +1125 m +1 h +1 h +1126 m +1 h +1 h +1127 m +1128 m +4 h +12 h +10 h +4 h +82 h +10 h +1 h +4 h +1 h +169 h +1 h +1129 m +1130 m +1 h +1131 m +1132 m +172 h +4 h +1133 m +10 h +10 h +1 h +1 h +1 h +1134 m +4 h +1 h +109 h +443 h +1 h +4 h +1135 m +1136 m +1137 m +1 h +1138 m +1 h +31 h +4 h +1139 m +1 h +10 h +4 h +4 h +1140 m +4 h +1 h +1 h +82 h +83 h +91 m +1141 m +4 h +1142 m +4 h +82 h +3 h +4 h +69 h +1143 m +4 h +1 h +1144 m +79 h +109 h +4 h +4 h +10 h +3 h +1145 m +82 h +297 h +4 h +1 h +1146 m +1 h +10 h +114 h +1 h +1147 m +13 h +1 h +3 h +12 h +4 h +4 h +1148 m +11 h +4 h +1 h +1149 m +79 h +4 h +1 h +124 h +10 h +64 h +10 h +4 h +10 h +1150 m +1151 m +41 h +1152 m +1153 m +4 h +1 h +4 h +1154 m +104 h +1 h +4 h +59 h +1155 m +1 h +338 m +59 h +1 h +4 h +1 h +1156 m +1157 m +112 h +1 h +4 h +1158 m +65 h +1 h +10 h +1159 m +1160 m +4 h +10 h +1161 m +1 h +109 h +4 h +11 h +4 h +1162 m +91 h +4 h +125 h +1 h +10 h +4 h +4 h +1 h +4 h +173 h +59 h +1163 m +124 h +1164 m +1165 m +104 h +73 h +1166 m +1167 m +1168 m +1169 m +10 h +4 h +10 h +22 h +82 h +11 h +1170 m +4 h +885 h +10 h +4 h +11 h +4 h +4 h +4 h +4 h +4 h +4 h +4 h +146 h +45 h +1171 m +1172 m +169 h +3 h +4 h +108 h +4 h +1 h +1173 m +4 h +114 h +1174 m +1 h +1175 m +4 h +1027 h +1176 m +10 h +4 h +1 h +97 h +75 m +4 h +10 h +83 h +4 h +25 h +1177 m +1178 m +536 h +4 h +4 h +10 h +109 h +73 h +1179 m +4 h +3 h +31 h +25 h +10 h +1180 m +10 h +1181 m +1182 m +4 h +1 h +1 h +4 h +1016 m +10 h +10 h +4 h +4 h +11 h +4 h +11 h +1183 m +1 h +1184 m +4 h +4 h +493 h +1 h +4 h +4 h +1 h +10 h +10 h +74 h 
+1 h +4 h +4 h +10 h +4 h +59 h +143 h +129 h +1185 m +10 h +1 h +1186 m +258 h +4 h +1187 m +1 h +10 h +1 h +1188 m +10 h +403 m +10 h +1189 m +1190 m +10 h +10 h +79 h +1 h +10 h +4 h +4 h +1191 m +10 h +1 h +10 h +94 h +4 h +11 h +4 h +1 h +1192 m +1 h +4 h +4 h +135 h +31 h +1193 m +4 h +4 h +1 h +368 h +10 h +4 h +73 h +4 h +10 h +4 h +1016 h +219 m +11 h +250 h +4 h +4 h +1 h +10 h +109 h +1194 m +1 h +12 h +1195 m +27 h +10 h +270 h +3 h +1 h +10 h +10 h +1062 m +10 h +297 h +1 h +170 h +282 h +57 h +10 h +1196 m +4 h +82 h +1197 m +1 h +1 h +1 h +13 h +1 h +1198 m +10 h +1199 m +1198 h +1 h +135 h +11 h +4 h +386 h +1200 m +4 h +10 h +57 h +1201 m +10 h +4 h +1202 m +10 h +1203 m +1204 m +4 h +1 h +11 h +649 m +41 h +1 h +469 m +172 h +74 h +4 h +1 h +4 h +1 h +94 h +1205 m +256 m +3 h +224 h +1206 m +11 h +1207 m +4 h +386 h +4 h +1 h +75 h +10 h +10 h +353 h +4 h +1208 m +4 h +4 h +1 h +10 h +10 h +4 h +4 h +1 h +1 h +10 h +1209 m +11 h +1210 m +4 h +4 h +135 h +11 h +1 h +10 h +31 h +4 h +327 m +1 h +4 h +4 h +1211 m +57 h +4 h +4 h +1 h +1212 m +1213 m +1 h +1 h +10 h +1 h +10 h +4 h +56 h +10 h +1214 m +1215 m +1216 m +196 h +11 h +97 h +1217 m +1218 m +1 h +4 h +4 h +1219 m +10 h +1220 m +1221 m +4 h +1222 m +1 h +123 h +10 h +1223 m +4 h +65 h +169 h +1224 m +10 h +4 h +1 h +10 h +1225 m +4 h +10 h +13 h +4 h +4 h +1226 m +250 h +1227 m +1 h +10 h +4 h +87 m +1228 m +4 h +4 h +4 h +353 h +138 h +10 h +10 h +1 h +1 h +10 h +1229 m +1230 m +4 h +4 h +1231 m +1 h +1232 m +1 h +1233 m +4 h +4 h +1 h +36 h +4 h +4 h +4 h +327 h +1 h +25 h +4 h +4 h +41 h +82 h +1 h +4 h +10 h +238 h +10 h +11 h +1 h +1 h +1 h +383 h +10 h +97 h +448 m +1234 m +1235 m +11 h +4 h +1 h +10 h +4 h +1 h +4 h +11 h +94 h +4 h +144 m +1236 m +10 h +125 h +1237 m +4 h +10 h +322 m +4 h +1 h +57 h +4 h +112 h +124 h +1238 m +1239 m +10 h +10 h +1240 m +1241 m +10 h +10 h +4 h +57 h +1242 m +1 h +258 h +109 h +170 h +1 h +1243 m +1 h +1 h +4 h +181 h +1244 m +10 h +1 h +4 h +4 h +1 
h +4 h +358 h +10 h +4 h +113 h +97 h +1245 m +4 h +25 h +4 h +238 h +1246 m +124 h +4 h +10 h +1 h +23 m +4 h +1247 m +1 h +332 h +1 h +83 h +92 h +124 h +1 h +1248 m +4 h +1249 m +4 h +1250 m +10 h +1251 m +1252 m +158 h +1253 m +4 h +10 h +1254 m +1255 m +1256 m +4 h +1 h +4 h +1257 m +4 h +158 h +10 h +1258 m +1 h +4 h +276 h +4 h +575 h +1259 m +4 h +4 h +1260 m +1 h +11 h +1261 m +8 h +10 h +1262 m +1263 m +1 h +1264 m +4 h +1265 m +3 h +264 m +4 h +1266 m +4 h +124 h +4 h +4 h +4 h +1 h +1 h +36 h +1267 m +1 h +1 h +1268 m +4 h +4 h +12 h +1 h +74 h +124 h +195 h +1269 m +10 h +358 h +10 h +109 h +190 h +4 h +10 h +1270 m +1271 m +1272 m +4 h +4 h +1 h +10 h +4 h +169 h +11 h +41 h +1 h +1273 m +1274 m +4 h +1275 m +1276 m +195 h +1 h +1277 m +10 h +1 h +1 h +1 h +1278 m +73 h +1 h +1279 m +1096 m +10 h +10 h +1 h +10 h +4 h +124 h +170 h +4 h +4 h +25 h +4 h +97 h +138 h +4 h +1280 m +10 h +8 h +10 h +1 h +196 h +195 h +1281 m +173 h +195 h +1282 m +1 h +1283 m +1284 m +4 h +1285 m +1286 m +57 h +1287 m +582 m +4 h +10 h +11 h +1288 m +1289 m +1 h +56 h +434 h +146 h +1290 m +1291 m +164 h +10 h +11 h +4 h +1 h +1292 m +4 h +1293 m +265 h +4 h +10 h +1294 m +327 h +1295 m +1 h +55 h +1296 m +538 h +1297 m +10 h +1 h +181 h +94 h +1 h +1 h +4 h +4 h +1298 m +4 h +10 h +1299 m +12 h +1300 m +10 h +114 h +10 h +1 h +124 h +119 h +4 h +1301 m +3 h +4 h +1 h +10 h +1302 m +1303 m +10 h +1 h +10 h +4 h +1304 m +1 h +4 h +4 h +147 h +1305 m +4 h +4 h +1 h +57 h +4 h +3 h +1306 m +4 h +41 h +1307 m +4 h +48 h +10 h +139 h +4 h +11 h +13 h +10 h +109 h +3 h +1308 m +10 h +1 h +1309 m +4 h +1310 m +4 h +119 h +1 h +4 h +4 h +1311 m +22 h +146 h +1312 m +4 h +1 h +258 h +4 h +1 h +1313 m +307 h +1314 m +135 h +10 h +124 h +4 h +10 h +4 h +146 h +10 h +184 m +4 h +135 h +10 h +1315 m +4 h +4 h +10 h +4 h +10 h +27 h +1 h +10 h +10 h +840 m +1 h +4 h +4 h +493 h +307 h +65 h +1 h +112 h +181 h +25 h +1316 m +4 h +447 m +4 h +4 h +140 h +1317 m +1318 m +109 h +11 h +1319 
m +1320 m +1 h +10 h +1321 m +123 h +4 h +173 h +1322 m +4 h +10 h +1323 m +4 h +10 h +135 h +1324 m +1 h +109 h +1325 m +1326 m +11 h +1327 m +1 h +1 h +158 h +10 h +144 m +36 h +57 h +4 h +10 h +10 h +278 h +1 h +4 h +10 h +124 h +1 h +4 h +4 h +4 h +4 h +4 h +1 h +10 h +1328 m +59 h +4 h +94 h +10 h +1329 m +1 h +1 h +4 h +1330 m +10 h +10 h +4 h +4 h +1 h +1 h +10 h +1331 m +1 h +10 h +82 h +57 h +4 h +4 h +1332 m +10 h +4 h +29 m +399 m +1 h +59 h +119 h +1 h +12 h +4 h +1333 m +629 m +1 h +1334 m +1335 m +1336 m +1 h +11 h +1337 m +10 h +1338 m +36 h +1 h +4 h +4 h +1339 m +1340 m +25 h +124 h +124 h +4 h +1 h +92 h +65 h +41 h +4 h +82 h +10 h +4 h +10 h +204 h +10 h +31 h +125 h +1341 m +4 h +1 h +10 h +123 h +276 h +77 h +170 h +1 h +12 h +10 h +10 h +10 h +1342 m +10 h +31 h +11 h +1 h +1 h +4 h +4 h +1343 m +10 h +1016 h +4 h +1344 m +4 h +1345 m +4 h +13 h +45 h +3 h +1346 m +1 h +10 h +1347 m +1348 m +1 h +1349 m +266 h +1 h +1350 m +4 h +1 h +59 h +1351 m +1 h +1 h +10 h +10 h +4 h +1352 m +4 h +1353 m +172 h +4 h +4 h +48 h +3 h +4 h +4 h +11 h +1354 m +1355 m +1356 m +4 h +4 h +1 h +10 h +1357 m +1 h +167 h +41 h +10 h +236 m +10 h +1358 m +1 h +10 h +1359 m +1 h +25 h +1360 m +10 h +3 h +4 h +1361 m +48 h +4 h +1362 m +119 h +1363 m +1 h +1 h +10 h +1364 m +10 h +258 h +82 h +82 h +4 h +195 h +4 h +1365 m +4 h +1366 m +10 h +10 h +170 h +55 h +25 h +1367 m +1368 m +4 h +11 h +1 h +124 h +1 h +4 h +113 h +10 h +339 m +12 h +11 h +1369 m +10 h +146 h +4 h +10 h +82 h +1 h +1370 m +1 h +181 h +1371 m +59 h +1 h +4 h +1372 m +1373 m +986 m +10 h +1 h +65 h +1374 m +1 h +10 h +10 h +1 h +1271 m +1375 m +1 h +92 h +1 h +157 m +1376 m +1377 m +1 h +1 h +82 h +13 h +65 h +4 h +1378 m +10 h +4 h +10 h +1379 m +630 m +737 m +1 h +1 h +4 h +4 h +97 h +4 h +4 h +4 h +1380 m +1261 m +1189 m +1381 m +1382 m +10 h +4 h +4 h +1 h +1383 m +4 h +57 h +1384 m +4 h +11 h +41 h +1385 m +65 h +4 h +11 h +83 h +4 h +1386 m +1387 m +1 h +1105 m +1388 m +135 h +1389 m 
+1390 m +1391 m +106 m +4 h +1392 m +10 h +173 h +1 h +1 h +1393 m +1394 m +1198 h +10 h +4 h +1 h +1 h +125 h +28 h +1 h +10 h +4 h +601 m +104 h +28 h +57 h +1 h +1395 m +104 h +1396 m +1 h +4 h +1397 m +4 h +10 h +41 h +1 h +57 h +10 h +4 h +12 h +124 h +4 h +1 h +4 h +1 h +4 h +4 h +124 h +4 h +10 h +10 h +1398 m +94 h +10 h +4 h +4 h +4 h +10 h +109 h +1399 m +1 h +1 h +1 h +10 h +1400 m +4 h +4 h +1 h +1401 m +74 h +4 h +1402 m +41 h +1 h +1 h +10 h +79 h +144 h +1 h +92 h +1403 m +4 h +4 h +1 h +82 h +1404 m +1405 m +1406 m +195 h +4 h +3 h +105 h +10 h +10 h +1 h +4 h +250 h +146 h +4 h +22 h +1 h +46 m +10 h +1407 m +31 h +4 h +367 m +1 h +1408 m +359 m +1409 m +140 h +4 h +74 h +91 h +124 h +1 h +10 h +158 h +1 h +4 h +4 h +4 h +10 h +4 h +1410 m +276 h +1411 m +1 h +59 h +74 h +4 h +11 h +146 h +4 h +10 h +1 h +4 h +4 h +118 h +4 h +1412 m +10 h +1 h +1413 m +4 h +1414 m +124 h +10 h +1415 m +10 h +1416 m +3 h +1027 h +1417 m +1 h +224 h +1 h +1418 m +4 h +282 h +4 h +1419 m +1420 m +1250 m +1421 m +1 h +10 h +4 h +1422 m +65 h +4 h +59 h +112 h +4 h +1309 m +94 h +1423 m +10 h +1424 m +1 h +190 h +4 h +4 h +1105 h +82 h +1425 m +1426 m +104 h +41 h +687 h +82 h +1 h +1 h +109 h +4 h +4 h +1427 m +12 h +4 h +1428 m +94 h +4 h +1 h +4 h +125 h +274 h +1429 m +4 h +258 h +192 h +1 h +123 h +125 h +83 h +4 h +129 h +173 h +1 h +1430 m +124 h +1431 m +79 h +3 h +10 h +4 h +1432 m +4 h +10 h +4 h +1433 m +1434 m +1 h +1435 m +10 h +4 h +4 h +1 h +11 h +1250 h +4 h +4 h +4 h +1436 m +1437 m +1 h +4 h +11 h +4 h +139 h +4 h +4 h +4 h +4 h +1438 m +10 h +4 h +1 h +1 h +270 h +1439 m +1440 m +1 h +124 h +1441 m +1442 m +10 h +1 h +4 h +4 h +10 h +4 h +10 h +1443 m +4 h +1444 m +4 h +4 h +10 h +4 h +278 h +1445 m +1446 m +4 h +1447 m +181 h +1448 m +1449 m +399 m +73 h +1 h +82 h +124 h +1 h +1 h +1450 m +1451 m +1452 m +4 h +1453 m +59 h +1454 m +1 h +367 h +10 h +4 h +4 h +1455 m +1456 m +615 m +929 m +1 h +4 h +4 h +11 h +1 h +4 h +1 h +1457 m +1458 m +1459 m 
+10 h +109 h +1460 m +1461 m +1462 m +27 h +4 h +169 h +4 h +1463 m +1464 m +1465 m +4 h +83 h +447 m +11 h +25 h +10 h +4 h +1466 m +12 h +4 h +25 h +164 h +332 h +1 h +11 h +10 h +1467 m +196 h +4 h +36 h +10 h +332 h +1468 m +73 h +258 h +1469 m +4 h +105 h +10 h +4 h +59 h +4 h +1 h +1470 m +1471 m +1 h +1 h +4 h +10 h +167 h +10 h +1 h +11 h +1472 m +1473 m +185 h +1474 m +1 h +4 h +4 h +4 h +36 h +1 h +1 h +238 h +1475 m +64 h +11 h +1476 m +59 h +4 h +1 h +1477 m +146 h +4 h +196 h +4 h +368 h +124 h +4 h +10 h +143 h +1478 m +4 h +1 h +146 h +4 h +65 h +97 h +1 h +1479 m +276 h +4 h +1478 h +1 h +196 h +4 h +1480 m +1481 m +1482 m +1483 m +119 h +57 h +399 h +1484 m +1485 m +1486 m +1487 m +135 h +4 h +1 h +4 h +935 m +73 h +158 h +3 h +59 h +4 h +173 h +1488 m +1 h +10 h +4 h +1 h +1489 m +4 h +1 h +1490 m +4 h +1 h +1491 m +1492 m +4 h +109 h +1493 m +1 h +10 h +1 h +36 h +1494 m +1495 m +10 h +1496 m +4 h +185 h +1497 m +1 h +4 h +1 h +1 h +146 h +4 h +64 h +412 m +4 h +4 h +1 h +266 h +1 h +4 h +1359 m +1498 m +1499 m +13 h +4 h +10 h +359 m +10 h +172 h +1 h +4 h +480 m +307 h +109 h +1 h +297 h +1500 m +25 h +1 h +10 h +3 h +1 h +10 h +4 h +1501 m +1 h +10 h +1 h +124 h +4 h +1502 m +10 h +158 h +4 h +4 h +1503 m +4 h +119 h +1 h +83 h +31 h +10 h +1 h +996 m +195 h +4 h +1504 m +1 h +4 h +737 m +1505 m +1506 m +57 h +1 h +10 h +770 m +1507 m +4 h +1508 m +173 h +1509 m +10 h +1 h +1510 m +1511 m +4 h +279 h +228 m +124 h +3 h +1512 m +4 h +1 h +172 h +1 h +10 h +1513 m +4 h +1514 m +40 h +10 h +41 h +10 h +1 h +124 h +1515 m +4 h +1516 m +55 h +1 h +4 h +1517 m +11 h +1518 m +4 h +4 h +1519 m +1520 m +4 h +10 h +1 h +1521 m +59 h +10 h +1522 m +1 h +1523 m +83 h +10 h +74 h +140 h +41 h +10 h +10 h +1 h +10 h +4 h +10 h +443 h +4 h +56 h +82 h +258 h +536 h +13 h +10 h +1524 m +10 h +4 h +10 h +4 h +1525 m +1 h +1362 m +1526 m +57 h +1527 m +266 h +4 h +1528 m +124 h +185 h +1 h +4 h +185 h +4 h +167 h +4 h +1 h +114 h +4 h +1 h +266 h +4 h +10 h +4 
h +1529 m +1530 m +1531 m +10 h +10 h +195 h +4 h +10 h +1532 m +601 m +1 h +1533 m +10 h +1218 m +1 h +4 h +1534 m +4 h +1535 m +4 h +82 h +11 h +1 h +1536 m +10 h +195 h +10 h +124 h +4 h +1 h +1537 m +1538 m +1539 m +4 h +276 h +114 h +4 h +1 h +4 h +10 h +258 h +1 h +1540 m +82 h +1003 h +92 h +1541 m +156 h +4 h +1542 m +4 h +4 h +40 h +13 h +1543 m +4 h +73 h +10 h +4 h +22 h +1544 m +1545 m +4 h +4 h +4 h +4 h +4 h +4 h +59 h +83 h +4 h +4 h +1546 m +601 h +1 h +10 h +4 h +1 h +10 h +1 h +1547 m +1 h +1 h +83 h +112 h +4 h +208 m +1 h +11 h +10 h +10 h +1 h +1 h +4 h +4 h +4 h +1548 m +857 h +4 h +4 h +1549 m +109 h +59 h +1550 m +10 h +45 h +65 h +1 h +25 h +4 h +4 h +1551 m +4 h +109 h +228 h +12 h +4 h +1552 m +1 h +1553 m +1 h +4 h +10 h +4 h +1554 m +4 h +27 h +1555 m +4 h +1 h +10 h +4 h +129 h +192 h +1556 m +25 h +1557 m +1 h +1558 m +1559 m +124 h +4 h +4 h +443 h +10 h +1560 m +10 h +1 h +1 h +1561 m +10 h +57 h +4 h +10 h +1562 m +4 h +4 h +1563 m +1564 m +10 h +10 h +1565 m +4 h +1566 m +1567 m +1568 m +1569 m +4 h +1570 m +83 h +4 h +4 h +1 h +1 h +167 h +371 h +97 h +1299 m +59 h +10 h +10 h +4 h +1571 m +4 h +4 h +109 h +11 h +1572 m +4 h +10 h +74 h +10 h +1573 m +4 h +10 h +1574 m +1575 m +1 h +124 h +10 h +10 h +1576 m +4 h +10 h +1359 h +13 h +104 h +59 h +1577 m +1027 h +1578 m +1579 m +1580 m +266 h +1 h +1581 m +196 h +295 h +1582 m +1 h +1 h +1583 m +1 h +1584 m +4 h +1 h +1344 m +1 h +1585 m +1 h +1 h +4 h +1586 m +11 h +36 h +97 h +1 h +1 h +25 h +1 h +987 m +241 h +1587 m +10 h +156 h +195 h +82 h +1 h +4 h +1 h +10 h +1588 m +278 h +1589 m +1590 m +1 h +109 h +1591 m +888 m +1 h +1592 m +4 h +13 h +4 h +12 h +823 m +1593 m +4 h +146 h +82 h +1 h +73 h +1594 m +1 h +1595 m +1596 m +1 h +1597 m +10 h +1598 m +73 h +4 h +10 h +109 h +1 h +31 h +1 h +1599 m +4 h +1600 m +1 h +1601 m +204 h +10 h +10 h +1602 m +1603 m +1 h +4 h +1604 m +1 h +10 h +1 h +1 h +4 h +10 h +10 h +1 h +1 h +1605 m +297 h +55 h +124 h +140 h +10 h +59 h +1606 m 
+10 h +4 h +73 h +4 h +4 h +4 h +276 h +10 h +1607 m +1 h +1 h +1 h +125 h +1 h +1608 m +10 h +433 m +615 m +1535 m +10 h +10 h +10 h +1609 m +69 h +4 h +536 h +1 h +1610 m +4 h +4 h +1611 m +1 h +1612 m +4 h +25 h +10 h +1613 m +4 h +1614 m +1 h +1615 m +3 h +196 h +25 h +41 h +1616 m +1617 m +181 h +1 h +4 h +4 h +1 h +124 h +10 h +10 h +1 h +1 h +1618 m +1 h +1 h +10 h +1 h +3 h +10 h +4 h +1619 m +1620 m +4 h +4 h +4 h +262 h +41 h +167 h +1621 m +4 h +10 h +1622 m +64 h +1623 m +167 h +1 h +4 h +124 h +4 h +1624 m +4 h +1 h +353 h +1625 m +1437 m +11 h +383 h +1 h +1626 m +135 h +4 h +1627 m +181 h +92 h +31 h +104 h +730 m +1628 m +4 h +11 h +1629 m +10 h +1 h +73 h +1630 m +1631 m +4 h +10 h +55 h +1 h +1632 m +4 h +1 h +4 h +256 m +1 h +4 h +4 h +1633 m +1 h +368 h +4 h +1 h +4 h +114 h +1634 m +1 h +1635 m +4 h +10 h +1636 m +4 h +172 h +10 h +190 h +10 h +124 h +1637 m +195 h +57 h +10 h +4 h +4 h +1638 m +4 h +1 h +181 h +1639 m +11 h +1640 m +1 h +1641 m +1642 m +1643 m +4 h +167 h +1644 m +10 h +1645 m +4 h +4 h +10 h +4 h +1646 m +10 h +4 h +10 h +307 h +64 h +692 h +368 h +1 h +386 h +41 h +538 h +265 h +1 h +59 h +1647 m +1648 m +4 h +1 h +28 h +443 h +186 m +125 h +10 h +1649 m +3 h +10 h +1650 m +4 h +1 h +10 h +1651 m +4 h +1652 m +4 h +10 h +1653 m +1 h +139 h +10 h +1654 m +4 h +83 h +4 h +1 h +443 h +10 h +10 h +4 h +1655 m +1 h +124 h +1656 m +307 h +10 h +1657 m +4 h +10 h +10 h +1658 m +1 h +4 h +79 h +1 h +4 h +1659 m +1660 m +59 h +1661 m +1662 m +1 h +36 h +4 h +1663 m +83 h +4 h +1 h +114 h +359 h +65 h +1664 m +1665 m +1666 m +10 h +1667 m +3 h +1668 m +1 h +59 h +45 h +1669 m +4 h +104 h +1670 m +1671 m +83 h +119 h +10 h +1 h +10 h +1 h +56 h +1672 m +1 h +10 h +241 h +575 h +4 h +4 h +536 h +109 h +167 h +266 h +4 h +4 h +1673 m +10 h +4 h +119 h +4 h +146 h +10 h +54 m +10 h +1 h +124 h +1 h +91 h +4 h +1 h +1674 m +1 h +59 h +1675 m +278 h +1 h +4 h +4 h +1676 m +1 h +1 h +1677 m +36 h +1 h +10 h +10 h +48 h +1 h +59 h +1678 m +40 
h +1 h +1 h +371 h +1 h +1679 m +4 h +1680 m +4 h +10 h +57 h +1 h +250 h +4 h +1 h +1 h +4 h +56 h +4 h +11 h +74 h +74 h +4 h +12 h +59 h +1681 m +10 h +1 h +1682 m +10 h +270 h +1 h +1 h +4 h +10 h +1 h +1683 m +69 h +3 h +1684 m +10 h +10 h +11 h +10 h +4 h +459 h +41 h +872 m +1 h +1685 m +4 h +1 h +1686 m +25 h +1 h +15 m +4 h +1687 m +109 h +104 h +1 h +1 h +1688 m +4 h +1689 m +10 h +10 h +4 h +295 h +1690 m +1 h +59 h +10 h +4 h +1 h +4 h +4 h +190 h +1691 m +1 h +1692 m +640 h +4 h +1 h +1 h +4 h +10 h +4 h +41 h +4 h +1 h +1693 m +4 h +36 h +25 h +4 h +4 h +1694 m +1695 m +1696 m +10 h +109 h +135 h +1 h +1697 m +1698 m +4 h +4 h +12 h +1699 m +1016 h +278 h +1 h +11 h +10 h +1 h +4 h +1700 m +1137 m +10 h +170 h +4 h +1701 m +1137 h +1074 m +1702 m +1703 m +4 h +125 h +4 h +10 h +64 h +1704 m +10 h +1 h +278 h +1 h +1705 m +238 h +687 h +1706 m +1 h +124 h +36 h +1707 m +388 m +1708 m +25 h +1 h +1709 m +10 h +57 h +570 m +90 m +1 h +10 h +1710 m +10 h +4 h +1711 m +10 h +1 h +264 m +4 h +4 h +1712 m +1713 m +1107 m +4 h +82 h +1 h +1714 m +1 h +104 h +359 h +4 h +11 h +1 h +1 h +11 h +10 h +285 m +4 h +1 h +10 h +1715 m +1 h +119 h +57 h +41 h +83 h +976 m +169 h +11 h +1716 m +1250 h +3 h +10 h +1 h +1717 m +1 h +65 h +124 h +649 m +1269 m +1718 m +57 h +656 m +1 h +112 h +1719 m +1720 m +4 h +1 h +10 h +1721 m +92 h +109 h +10 h +1722 m +1 h +1723 m +4 h +10 h +4 h +1261 h +4 h +4 h +4 h +10 h +10 h +640 h +1 h +185 h +1 h +1 h +11 h +94 h +1067 m +1724 m +10 h +1725 m +1726 m +1 h +4 h +190 h +4 h +181 h +4 h +4 h +57 h +57 h +185 h +10 h +1727 m +93 m +4 h +737 h +10 h +140 h +1728 m +338 m +4 h +1729 m +143 h +4 h +1 h +4 h +4 h +8 h +4 h +4 h +10 h +1730 m +13 h +367 h +4 h +3 h +1731 m +1732 m +1 h +23 m +4 h +272 h +31 h +10 h +97 h +1733 m +4 h +124 h +124 h +1 h +1734 m +157 m +1 h +258 h +1735 m +1736 m +1 h +1191 m +36 h +109 h +74 h +104 h +3 h +10 h +135 h +45 h +185 h +238 h +1737 m +4 h +10 h +4 h +1738 m +4 h +4 h +1 h +1 h +4 h +10 h 
+1739 m +1740 m +10 h +1 h +4 h +1741 m +1 h +1742 m +1 h +4 h +13 h +4 h +1321 m +59 h +4 h +164 h +157 h +4 h +4 h +1743 m +10 h +83 h +4 h +13 h +1744 m +1 h +1745 m +114 h +65 h +1261 h +10 h +359 h +1 h +83 h +10 h +1746 m +83 h +25 h +1747 m +1748 m +1749 m +156 h +1 h +1750 m +73 h +4 h +1751 m +1 h +1752 m +4 h +12 h +1753 m +4 h +109 h +10 h +1754 m +4 h +1755 m +27 h +4 h +10 h +1 h +10 h +10 h +10 h +57 h +1756 m +4 h +4 h +59 h +10 h +1757 m +1758 m +1759 m +181 h +1 h +1760 m +109 h +1 h +1406 m +1761 m +10 h +1 h +4 h +4 h +1762 m +10 h +1763 m +1764 m +10 h +1 h +11 h +1765 m +10 h +10 h +1 h +1 h +41 h +1766 m +1 h +11 h +11 h +4 h +109 h +1 h +1767 m +1768 m +25 h +4 h +1309 h +10 h +1 h +125 h +3 h +10 h +10 h +1769 m +1 h +1 h +1770 m +4 h +307 h +1 h +443 h +4 h +169 h +1771 m +1772 m +1 h +1773 m +1 h +4 h +10 h +1774 m +4 h +4 h +4 h +4 h +1775 m +124 h +64 h +1776 m +1777 m +1 h +1 h +4 h +1778 m +10 h +172 h +36 h +4 h +1779 m +41 h +601 h +104 h +4 h +4 h +97 h +1780 m +4 h +3 h +1781 m +4 h +4 h +1 h +57 h +1 h +10 h +59 h +41 h +1 h +1 h +4 h +25 h +158 h +1 h +10 h +10 h +1782 m +4 h +10 h +1783 m +1784 m +10 h +4 h +55 h +1785 m +1786 m +174 h +4 h +1 h +4 h +1787 m +10 h +4 h +1788 m +11 h +4 h +4 h +1789 m +1790 m +4 h +41 h +97 h +4 h +33 m +1791 m +1792 m +4 h +1 h +10 h +36 h +106 m +506 m +156 h +1793 m +1794 m +615 h +4 h +1 h +4 h +4 h +1795 m +4 h +1 h +1 h +83 h +1796 m +1797 m +1798 m +73 h +4 h +1799 m +82 h +109 h +1800 m +4 h +4 h +185 h +1 h +4 h +10 h +158 h +1801 m +1108 m +4 h +59 h +4 h +1802 m +1 h +399 h +1 h +1803 m +4 h +4 h +1804 m +83 h +113 h +4 h +4 h +4 h +1805 m +114 h +4 h +4 h +1406 m +119 h +124 h +1806 m +3 h +1 h +184 m +1 h +10 h +4 h +104 h +109 h +109 h +10 h +4 h +125 h +4 h +97 h +59 h +10 h +4 h +4 h +692 h +1807 m +4 h +3 h +4 h +1808 m +4 h +10 h +31 h +1809 m +4 h +113 h +1 h +123 h +1810 m +1811 m +4 h +1 h +11 h +10 h +3 h +4 h +1 h +82 h +1812 m +4 h +1 h +123 h +1 h +82 h +976 m +10 h +12 h 
+1 h +12 h +10 h +1 h +4 h +124 h +10 h +4 h +4 h +1 h +4 h +4 h +1813 m +55 h +1814 m +1359 h +4 h +1815 m +4 h +1816 m +3 h +388 m +820 m +1 h +1817 m +1 h +4 h +1 h +11 h +4 h +11 h +4 h +1818 m +1819 m +31 h +185 h +4 h +1820 m +4 h +1 h +1821 m +4 h +73 h +1261 h +124 h +1822 m +10 h +4 h +1823 m +25 h +10 h +1824 m +4 h +1825 m +578 m +10 h +109 h +59 h +186 m +10 h +1826 m +4 h +1 h +1827 m +1470 m +83 h +4 h +10 h +1 h +41 h +447 h +1828 m +4 h +1 h +10 h +10 h +22 h +4 h +11 h +1829 m +4 h +106 m +4 h +1830 m +10 h +1831 m +10 h +322 m +4 h +278 h +4 h +109 h +869 m +1 h +1832 m +1833 m +4 h +1 h +25 h +1834 m +1 h +143 h +4 h +10 h +1 h +4 h +1 h +1835 m +1836 m +56 h +1 h +1837 m +4 h +307 h +4 h +1 h +1 h +1838 m +986 m +4 h +1 h +1 h +1 h +185 h +1 h +4 h +157 h +1839 m +4 h +4 h +4 h +92 h +4 h +1840 m +1841 m +10 h +109 h +4 h +1842 m +443 h +1843 m +109 h +1 h +4 h +4 h +1 h +1844 m +31 h +10 h +1 h +124 h +59 h +1 h +4 h +4 h +4 h +3 h +10 h +4 h +1 h +1 h +10 h +4 h +10 h +25 h +4 h +359 h +1 h +83 h +1845 m +144 h +4 h +4 h +386 h +4 h +1846 m +135 h +1847 m +40 h +158 h +82 h +1 h +82 h +4 h +1848 m +4 h +56 h +1849 m +1 h +10 h +10 h +1850 m +10 h +1127 m +4 h +1851 m +74 h +1 h +1852 m +1853 m +1854 m +1855 m +4 h +11 h +1856 m +1857 m +31 h +1 h +4 h +295 h +1403 m +10 h +4 h +57 h +1030 m +1 h +104 h +1 h +1858 m +173 h +1859 m +82 h +1 h +1860 m +1030 h +1 h +15 m +112 h +8 h +185 h +1482 m +4 h +11 h +1861 m +10 h +4 h +10 h +4 h +1 h +4 h +10 h +328 m +4 h +10 h +976 h +4 h +1862 m +4 h +10 h +4 h +1863 m +1 h +4 h +1864 m +468 m +1 h +1865 m +1089 m +4 h +10 h +1 h +10 h +11 h +1866 m +10 h +114 h +1 h +1867 m +1409 m +10 h +4 h +1868 m +31 h +129 h +4 h +1869 m +1870 m +1 h +143 h +124 h +1871 m +1872 m +1873 m +908 m +1796 m +4 h +4 h +113 h +10 h +1874 m +1875 m +10 h +31 h +10 h +230 m +4 h +10 h +1876 m +1 h +124 h +1877 m +4 h +4 h +1 h +10 h +59 h +1205 m +935 m +1 h +143 h +10 h +10 h +1878 m +4 h +1879 m +1880 m +69 h +10 h +55 
h +1 h +1 h +4 h +1 h +10 h +4 h +1881 m +4 h +4 h +4 h +85 m +901 m +1882 m +1 h +1883 m +1 h +4 h +10 h +11 h +1 h +4 h +57 h +569 h +1884 m +4 h +3 h +4 h +1 h +1885 m +124 h +4 h +520 m +1886 m +1 h +1887 m +1888 m +3 h +82 h +31 h +4 h +10 h +10 h +4 h +4 h +1 h +109 h +1 h +1889 m +616 m +4 h +185 h +4 h +1 h +56 h +10 h +1 h +1890 m +10 h +860 m +332 h +4 h +4 h +297 h +1 h +57 h +1 h +156 h +649 m +109 h +10 h +1891 m +4 h +1892 m +31 h +11 h +10 h +36 h +1 h +10 h +4 h +1893 m +73 h +4 h +10 h +1 h +10 h +1894 m +1895 m +4 h +4 h +10 h +4 h +4 h +4 h +4 h +4 h +10 h +4 h +238 h +1896 m +1 h +1897 m +1898 m +10 h +3 h +13 h +4 h +1 h +1899 m +4 h +4 h +1 h +64 h +10 h +10 h +278 h +4 h +4 h +4 h +10 h +12 h +1900 m +79 h +11 h +4 h +1250 h +10 h +1901 m +1902 m +3 h +1903 m +92 h +4 h +307 h +626 m +11 h +1 h +1 h +1904 m +164 h +27 h +1905 m +4 h +1389 m +4 h +4 h +1 h +25 h +10 h +10 h +1 h +4 h +10 h +1 h +538 h +1 h +307 h +11 h +1906 m +1 h +1 h +13 h +82 h +4 h +1 h +69 h +1907 m +3 h +1908 m +4 h +4 h +94 h +1909 m +1910 m +83 h +10 h +371 h +1911 m +10 h +4 h +1912 m +1913 m +1914 m +278 h +10 h +371 h +1772 m +83 h +4 h +11 h +82 h +1 h +230 m +8 h +1 h +4 h +1915 m +10 h +10 h +1916 m +1917 m +1918 m +4 h +1919 m +4 h +1920 m +4 h +4 h +359 h +1321 m +4 h +1 h +11 h +1 h +146 h +4 h +11 h +25 h +4 h +1 h +1921 m +1 h +4 h +1922 m +10 h +46 m +1923 m +4 h +1924 m +1470 m +1925 m +241 h +1926 m +1 h +4 h +4 h +8 h +10 h +10 h +124 h +172 h +1927 m +1 h +4 h +10 h +4 h +55 h +1 h +1 h +4 h +4 h +4 h +1928 m +1 h +56 h +4 h +10 h +27 h +4 h +4 h +59 h +1 h +11 h +1 h +31 h +146 h +4 h +1929 m +109 h +1 h +4 h +250 h +10 h +4 h +11 h +143 h +10 h +10 h +1930 m +10 h +4 h +79 h +1619 m +4 h +31 h +569 h +4 h +124 h +4 h +11 h +169 h +4 h +167 h +1 h +4 h +1931 m +399 h +4 h +266 h +4 h +1 h +1932 m +1780 m +10 h +1933 m +196 h +1934 m +1 h +1935 m +4 h +10 h +1936 m +1937 m +1 h +1 h +1 h +10 h +1938 m +1939 m +25 h +41 h +4 h +1 h +10 h +146 h +31 h 
+59 h +1940 m +1941 m +10 h +4 h +1942 m +4 h +1 h +1 h +4 h +313 m +4 h +1 h +83 h +10 h +1943 m +114 h +278 h +1944 m +1 h +1945 m +12 h +57 h +4 h +143 h +4 h +196 h +10 h +10 h +4 h +82 h +10 h +1946 m +1947 m +10 h +1 h +10 h +10 h +1 h +92 h +190 h +11 h +1 h +11 h +4 h +258 h +1948 m +185 h +92 h +1949 m +299 h +1950 m +41 h +1 h +4 h +1 h +11 h +10 h +1951 m +125 h +1260 m +10 h +124 h +1 h +1 h +12 h +10 h +186 h +1 h +1 h +1952 m +10 h +7 m +4 h +4 h +4 h +1953 m +4 h +4 h +4 h +238 h +82 h +82 h +1 h +1954 m +10 h +4 h +1 h +4 h +266 h +10 h +1955 m +1 h +1956 m +173 h +1 h +1 h +1027 h +1957 m +443 h +10 h +11 h +1958 m +4 h +164 h +386 h +4 h +1 h +1959 m +57 h +4 h +4 h +82 h +27 h +1960 m +1961 m +4 h +40 h +10 h +1962 m +319 m +10 h +3 h +10 h +10 h +11 h +1963 m +1964 m +1 h +10 h +3 h +270 h +10 h +10 h +1 h +109 h +4 h +113 h +307 h +1780 h +1 h +4 h +1965 m +1966 m +986 m +10 h +4 h +4 h +1967 m +1968 m +124 h +3 h +1969 m +1 h +4 h +87 m +4 h +1 h +1 h +65 h +1970 m +146 h +104 h +1632 m +4 h +10 h +4 h +125 h +83 h +109 h +1 h +4 h +124 h +10 h +124 h +11 h +10 h +1 h +4 h +4 h +1971 m +10 h +4 h +59 h +3 h +1 h +1972 m +10 h +139 h +195 h +10 h +1 h +1973 m +1 h +10 h +135 h +1 h +1 h +1 h +119 h +146 h +4 h +1 h +1974 m +1281 m +10 h +4 h +1975 m +1976 m +1977 m +110 h +146 h +1978 m +1 h +204 h +1 h +4 h +1979 m +124 h +1980 m +10 h +888 m +1 h +1835 m +4 h +10 h +94 h +109 h +10 h +4 h +1 h +4 h +10 h +4 h +4 h +10 h +1 h +1 h +4 h +238 h +1981 m +3 h +4 h +1982 m +11 h +10 h +1 h +4 h +41 h +1 h +1 h +1983 m +1 h +1 h +31 h +4 h +10 h +4 h +1 h +1 h +1 h +10 h +266 h +10 h +1714 m +147 h +4 h +1984 m +140 h +11 h +1985 m +4 h +59 h +1986 m +10 h +1987 m +4 h +173 h +104 h +10 h +73 h +4 h +57 h +4 h +1 h +82 h +10 h +1 h +1988 m +40 h +1 h +4 h +25 h +1989 m +307 h +4 h +1 h +295 h +1137 h +1 h +77 h +3 h +124 h +295 h +1990 m +10 h +4 h +265 h +11 h +12 h +190 h +1991 m +1992 m +4 h +1 h +59 h +1362 h +10 h +57 h +190 h +1 h +1 h +1993 m 
+1994 m +1 h +536 h +386 h +10 h +10 h +10 h +1 h +1995 m +4 h +13 h +172 h +276 h +1 h +4 h +1 h +692 h +10 h +10 h +1996 m +4 h +1997 m +10 h +4 h +1998 m +1999 m +2000 m +4 h +10 h +82 h +1 h +2001 m +4 h +147 h +1 h +2002 m +109 h +10 h +4 h +11 h +104 h +238 h +10 h +1 h +1 h +125 h +195 h +36 h +4 h +547 m +4 h +4 h +10 h +164 h +59 h +278 h +2003 m +2004 m +1 h +45 h +170 h +79 h +125 h +10 h +10 h +1478 h +1 h +2005 m +1 h +185 h +1 h +36 h +10 h +10 h +4 h +10 h +10 h +2006 m +57 h +2007 m +41 h +1 h +2008 m +2009 m +1 h +79 h +1 h +2010 m +124 h +536 h +97 h +2011 m +11 h +1 h +4 h +1 h +250 h +74 h +1 h +4 h +36 h +140 h +204 h +10 h +4 h +1 h +1403 m +1 h +10 h +4 h +330 h +10 h +1 h +1316 m +4 h +4 h +1 h +4 h +195 h +82 h +25 h +2012 m +307 h +11 h +2013 m +196 h +59 h +31 h +4 h +25 h +10 h +2014 m +1016 h +2015 m +2016 m +935 m +4 h +4 h +110 h +104 h +692 h +56 h +10 h +109 h +4 h +10 h +2017 m +2018 m +4 h +1 h +4 h +2019 m +2020 m +4 h +258 h +1 h +1 h +2021 m +10 h +13 h +1 h +172 h +4 h +196 h +4 h +10 h +59 h +10 h +2022 m +164 h +79 h +4 h +2023 m +4 h +2024 m +10 h +4 h +31 h +55 h +10 h +2025 m +97 h +278 h +4 h +4 h +1 h +10 h +41 h +2026 m +2027 m +1409 m +83 h +4 h +4 h +1 h +59 h +4 h +2028 m +10 h +520 m +2029 m +4 h +2030 m +2031 m +966 m +25 h +45 h +2032 m +4 h +74 h +4 h +83 h +2033 m +2034 m +331 m +1 h +4 h +57 h +2035 m +4 h +97 h +104 h +2036 m +1 h +31 h +278 h +139 h +241 h +4 h +2037 m +170 h +2038 m +2039 m +1 h +1 h +986 h +2040 m +1 h +83 h +4 h +2041 m +1 h +4 h +4 h +2042 m +1 h +2043 m +65 h +1 h +109 h +2044 m +1 h +10 h +10 h +1 h +2045 m +1 h +83 h +10 h +4 h +1 h +157 h +4 h +2046 m +1 h +124 h +10 h +11 h +4 h +2047 m +4 h +57 h +1 h +1 h +2048 m +2049 m +1 h +2050 m +4 h +1 h +83 h +25 h +1650 m +2051 m +1096 h +4 h +1 h +2052 m +11 h +10 h +1 h +2053 m +83 h +1 h +2054 m +1 h +1 h +4 h +10 h +2055 m +2056 m +2057 m +2058 m +2059 m +1 h +45 h +2060 m +123 h +2061 m +1607 m +104 h +4 h +1 h +4 h +1 h +4 h +1 h 
+169 h +1 h +79 h +1 h +258 h +124 h +11 h +82 h +2062 m +2063 m +1 h +31 h +1 h +1 h +1 h +1 h +10 h +1 h +3 h +331 m +4 h +1321 h +1 h +10 h +4 h +4 h +1 h +41 h +3 h +1 h +2064 m +4 h +4 h +2065 m +25 h +10 h +1 h +2066 m +3 h +57 h +33 m +22 h +4 h +124 h +1 h +4 h +10 h +41 h +10 h +295 h +1 h +2067 m +10 h +11 h +92 h +1016 h +2068 m +83 h +25 h +1 h +2069 m +224 h +157 h +12 h +2070 m +986 h +656 m +10 h +4 h +41 h +1 h +2071 m +1 h +109 h +238 h +204 h +2072 m +1074 m +4 h +1 h +2073 m +1 h +4 h +2074 m +4 h +82 h +59 h +2075 m +2076 m +2077 m +11 h +1 h +82 h +4 h +1 h +10 h +358 h +4 h +83 h +10 h +4 h +4 h +45 h +4 h +110 h +2078 m +1 h +25 h +4 h +2079 m +125 h +11 h +1 h +10 h +83 h +94 h +25 h +4 h +124 h +2080 m +10 h +2081 m +1 h +4 h +1 h +4 h +59 h +109 h +1 h +4 h +10 h +4 h +2082 m +1 h +2083 m +1 h +157 h +2084 m +10 h +4 h +55 h +2085 m +1 h +4 h +10 h +2086 m +4 h +10 h +4 h +4 h +986 h +2087 m +65 h +2088 m +10 h +172 h +10 h +4 h +1 h +59 h +2089 m +79 h +4 h +109 h +2090 m +2091 m +1 h +173 h +4 h +4 h +11 h +1 h +4 h +10 h +8 h +4 h +2092 m +4 h +4 h +2093 m +1 h +164 h +4 h +4 h +4 h +4 h +1 h +146 h +57 h +57 h +4 h +4 h +10 h +1128 m +172 h +2094 m +2095 m +10 h +2096 m +276 h +10 h +266 h +124 h +56 h +4 h +4 h +1 h +704 m +4 h +1 h +124 h +2097 m +10 h +10 h +25 h +4 h +4 h +103 m +114 h +10 h +10 h +2098 m +4 h +2099 m +2100 m +10 h +359 h +1 h +2101 m +22 h +143 h +1 h +4 h +4 h +1293 m +1 h +1 h +4 h +4 h +2102 m +262 h +57 h +192 h +172 h +4 h +1 h +2103 m +172 h +258 h +4 h +10 h +2104 m +4 h +4 h +3 h +4 h +250 h +2105 m +1 h +1 h +1 h +2106 m +144 h +1 h +112 h +297 h +2107 m +196 h +4 h +1 h +2108 m +4 h +83 h +1016 h +4 h +4 h +1 h +1 h +1 h +1 h +4 h +1766 m +2109 m +1 h +4 h +541 h +1 h +55 h +2110 m +185 h +105 h +10 h +31 h +2111 m +83 h +1 h +4 h +4 h +1089 m +4 h +2112 m +2113 m +4 h +2114 m +4 h +10 h +2115 m +4 h +4 h +10 h +1 h +2116 m +119 h +2117 m +4 h +4 h +2118 m +10 h +10 h +1 h +11 h +2119 m +4 h +2120 m +4 h 
+10 h +4 h +4 h +2121 m +1 h +1 h +4 h +74 h +2122 m +2123 m +79 h +4 h +2124 m +4 h +4 h +172 h +264 m +4 h +1 h +10 h +1 h +109 h +2125 m +4 h +1 h +258 h +4 h +196 h +4 h +10 h +4 h +1 h +10 h +104 h +4 h +1 h +82 h +1 h +4 h +2126 m +4 h +125 h +2127 m +2128 m +2129 m +172 h +135 h +2130 m +224 h +4 h +1 h +113 h +1 h +4 h +4 h +10 h +4 h +2131 m +10 h +10 h +55 h +2132 m +1 h +2133 m +4 h +2134 m +1 h +4 h +2135 m +146 h +110 h +2136 m +25 h +266 h +92 h +4 h +10 h +59 h +2137 m +10 h +2138 m +4 h +2139 m +2140 m +79 h +2141 m +4 h +4 h +106 h +4 h +2142 m +109 h +10 h +939 h +10 h +2143 m +4 h +41 h +4 h +4 h +125 h +1957 m +4 h +4 h +383 h +2144 m +842 m +1 h +265 h +10 h +1 h +10 h +2145 m +2146 m +1 h +59 h +1 h +2147 m +4 h +4 h +4 h +1 h +74 h +4 h +4 h +1 h +156 h +4 h +55 h +4 h +83 h +2148 m +1 h +2149 m +2150 m +172 h +2151 m +10 h +4 h +4 h +2152 m +1 h +265 h +2153 m +2154 m +4 h +59 h +1 h +1 h +2155 m +2156 m +11 h +4 h +10 h +169 h +10 h +4 h +4 h +4 h +1137 h +1 h +2157 m +4 h +1137 h +181 h +4 h +10 h +59 h +1 h +4 h +4 h +10 h +1 h +4 h +1 h +770 m +4 h +2158 m +4 h +104 h +55 h +1 h +258 h +4 h +2159 m +2160 m +4 h +10 h +4 h +2161 m +185 h +10 h +4 h +4 h +1 h +28 h +371 h +224 h +4 h +4 h +1 h +119 h +358 h +10 h +74 h +2162 m +10 h +4 h +11 h +10 h +10 h +332 h +57 h +238 h +238 h +2163 m +4 h +4 h +4 h +4 h +1 h +10 h +1 h +4 h +2164 m +10 h +4 h +2165 m +938 m +2166 m +4 h +11 h +4 h +1 h +41 h +256 m +224 h +4 h +2167 m +4 h +2168 m +2169 m +1 h +1 h +1796 h +4 h +2170 m +2171 m +4 h +4 h +4 h +2172 m +1 h +1 h +4 h +2173 m +4 h +307 h +2174 m +1137 h +1642 m +2175 m +10 h +1 h +196 h +25 h +361 m +10 h +4 h +2176 m +2177 m +36 h +2178 m +11 h +2179 m +11 h +2180 m +4 h +4 h +94 h +447 h +4 h +4 h +2181 m +45 h +2182 m +11 h +2183 m +4 h +1 h +31 h +59 h +10 h +124 h +4 h +1 h +196 h +4 h +1635 m +146 h +1309 h +2184 m +1 h +1535 h +2185 m +174 h +11 h +1 h +4 h +1 h +1 h +270 h +2186 m +82 h +10 h +1 h +84 m +403 h +2187 m +2188 m 
+2189 m +1 h +10 h +704 m +4 h +4 h +11 h +2190 m +2191 m +238 h +10 h +4 h +195 h +4 h +3 h +258 h +109 h +2192 m +4 h +2193 m +1 h +2194 m +4 h +1 h +4 h +4 h +536 h +2195 m +2196 m +4 h +4 h +10 h +57 h +57 h +2197 m +2198 m +10 h +57 h +1 h +10 h +69 h +2199 m +31 h +10 h +2200 m +2201 m +1 h +2202 m +1 h +27 h +1 h +976 h +1 h +114 h +279 h +82 h +4 h +1 h +2203 m +10 h +2204 m +1 h +2205 m +10 h +109 h +74 h +4 h +57 h +4 h +104 h +10 h +1 h +4 h +1 h +1559 m +4 h +4 h +1 h +2206 m +1 h +332 h +2207 m +224 h +2208 m +4 h +1 h +358 h +3 h +123 h +185 h +2209 m +2210 m +1 h +174 h +2211 m +10 h +2212 m +1 h +1 h +4 h +2213 m +1137 h +1 h +4 h +41 h +4 h +4 h +4 h +2214 m +10 h +186 h +2215 m +1 h +10 h +2216 m +109 h +1 h +10 h +2217 m +4 h +59 h +13 h +2218 m +1 h +2219 m +1 h +57 h +109 h +10 h +119 h +10 h +1 h +941 m +2148 m +1 h +1 h +4 h +31 h +443 h +10 h +109 h +2220 m +10 h +10 h +2221 m +82 h +2222 m +4 h +4 h +258 h +2223 m +97 h +4 h +3 h +1 h +1 h +2224 m +4 h +10 h +1 h +10 h +1 h +1 h +1 h +31 h +258 h +2225 m +174 h +1 h +10 h +4 h +1 h +4 h +135 h +1 h +2226 m +2227 m +84 m +195 h +1 h +4 h +55 h +4 h +10 h +25 h +119 h +4 h +10 h +64 h +2228 m +2229 m +1 h +4 h +4 h +2172 m +757 h +4 h +4 h +4 h +10 h +2230 m +262 h +2231 m +11 h +46 m +10 h +2232 m +4 h +3 h +276 h +41 h +4 h +114 h +1 h +1 h +4 h +1 h +383 h +190 h +25 h +1 h +1 h +1 h +1981 m +587 m +97 h +110 h +109 h +10 h +2233 m +10 h +4 h +403 h +2234 m +278 h +4 h +4 h +2235 m +146 h +2236 m +1 h +1 h +1 h +10 h +397 m +11 h +4 h +41 h +10 h +367 h +1 h +4 h +2237 m +1 h +196 h +1 h +10 h +250 h +1 h +82 h +65 h +3 h +4 h +83 h +1 h +403 h +4 h +1 h +59 h +2238 m +10 h +25 h +1 h +4 h +4 h +1 h +1 h +2239 m +2240 m +267 m +10 h +1 h +4 h +297 h +8 h +1 h +4 h +65 h +229 h +64 h +692 h +1 h +2241 m +2242 m +10 h +4 h +4 h +2243 m +10 h +2244 m +1 h +2245 m +258 h +4 h +569 h +1 h +192 h +10 h +4 h +195 h +31 h +1 h +4 h +1 h +2246 m +11 h +10 h +1 h +4 h +2247 m +2248 m +4 h +4 h +2249 
m +1 h +2250 m +10 h +135 h +2251 m +4 h +4 h +11 h +2252 m +2253 m +10 h +4 h +4 h +143 h +2254 m +57 h +1 h +124 h +13 h +1 h +3 h +447 h +11 h +1 h +41 h +10 h +1 h +4 h +2255 m +1 h +1 h +2256 m +4 h +11 h +73 h +97 h +172 h +4 h +4 h +4 h +11 h +332 h +1403 h +2257 m +2258 m +1024 m +83 h +109 h +368 h +1 h +289 h +69 h +11 h +109 h +2259 m +113 h +2260 m +11 h +1 h +4 h +4 h +4 h +297 h +976 h +10 h +1 h +10 h +59 h +82 h +4 h +1 h +4 h +190 h +2261 m +4 h +4 h +4 h +1868 m +2262 m +4 h +1 h +8 h +4 h +258 h +2263 m +2264 m +10 h +4 h +4 h +2265 m +4 h +4 h +11 h +124 h +10 h +10 h +2266 m +2267 m +1 h +10 h +10 h +1 h +8 h +3 h +230 h +196 h +4 h +10 h +2268 m +2269 m +4 h +2270 m +4 h +4 h +4 h +10 h +2271 m +1 h +4 h +10 h +41 h +10 h +10 h +4 h +4 h +10 h +125 h +322 m +4 h +104 h +2272 m +2273 m +65 h +74 h +2274 m +1 h +4 h +2275 m +2276 m +2277 m +11 h +4 h +57 h +11 h +1 h +10 h +10 h +4 h +10 h +1406 h +4 h +28 h +1 h +10 h +2278 m +1938 m +109 h +2279 m +2280 m +2281 m +10 h +1 h +4 h +2282 m +1 h +4 h +36 h +2283 m +1 h +4 h +2284 m +91 h +4 h +2285 m +4 h +1 h +1 h +11 h +1 h +10 h +2286 m +4 h +359 h +2287 m +97 h +10 h +1 h +4 h +1 h +2288 m +582 m +1027 h +4 h +4 h +250 h +2289 m +164 h +4 h +250 h +2290 m +2291 m +1 h +2292 m +10 h +1 h +297 h +1 h +1 h +1 h +4 h +2293 m +109 h +4 h +4 h +4 h +2294 m +10 h +83 h +2295 m +2257 m +4 h +10 h +3 h +82 h +2296 m +4 h +2297 m +11 h +3 h +279 h +4 h +3 h +65 h +4 h +10 h +4 h +123 h +2298 m +1 h +4 h +4 h +167 h +4 h +4 h +1 h +4 h +157 h +2299 m +4 h +83 h +10 h +4 h +10 h +2300 m +74 h +11 h +295 h +1 h +4 h +1 h +10 h +2301 m +10 h +1 h +4 h +1 h +1 h +1 h +4 h +10 h +1 h +1 h +4 h +56 h +10 h +4 h +386 h +4 h +353 h +1 h +10 h +1 h +4 h +2302 m +1 h +307 h +4 h +2303 m +1 h +2124 m +10 h +1985 m +10 h +10 h +4 h +4 h +2304 m +124 h +359 h +2305 m +10 h +59 h +4 h +119 h +139 h +692 h +4 h +1 h +2306 m +10 h +4 h +4 h +4 h +2307 m +10 h +278 h +1 h +10 h +332 h +31 h +97 h +1 h +4 h +10 h +1016 h 
+82 h +114 h +4 h +10 h +276 h +4 h +307 h +1 h +2308 m +4 h +4 h +10 h +170 h +1 h +1 h +4 h +4 h +11 h +332 h +83 h +4 h +1 h +2309 m +10 h +59 h +74 h +2310 m +10 h +1220 m +2311 m +4 h +1 h +25 h +2312 m +4 h +2313 m +4 h +4 h +2314 m +11 h +1 h +4 h +2315 m +2316 m +4 h +241 h +4 h +1 h +2317 m +79 h +297 h +4 h +2318 m +2319 m +11 h +1 h +4 h +2320 m +10 h +4 h +10 h +2321 m +2322 m +1 h +2323 m +1 h +11 h +2324 m +1 h +124 h +4 h +10 h +79 h +4 h +2325 m +65 h +2326 m +4 h +2327 m +718 h +2328 m +2329 m +57 h +2330 m +4 h +10 h +2331 m +299 h +4 h +477 m +94 h +11 h +10 h +45 h +10 h +976 h +2332 m +109 h +2333 m +10 h +11 h +3 h +2334 m +4 h +4 h +1 h +147 h +109 h +41 h +11 h +10 h +4 h +2335 m +2336 m +4 h +4 h +11 h +2337 m +2338 m +94 h +13 h +1 h +1 h +1 h +4 h +4 h +195 h +1261 h +10 h +10 h +1 h +1 h +2339 m +11 h +4 h +4 h +10 h +4 h +1 h +4 h +1 h +169 h +2340 m +94 h +4 h +82 h +2341 m +536 h +41 h +274 h +184 m +11 h +1 h +10 h +2342 m +11 h +2343 m +10 h +93 m +3 h +4 h +10 h +83 h +2344 m +25 h +11 h +2345 m +1 h +2346 m +2347 m +4 h +4 h +135 h +2348 m +4 h +1 h +4 h +2349 m +4 h +10 h +4 h +10 h +802 m +10 h +1 h +12 h +10 h +65 h +4 h +2350 m +4 h +2351 m +10 h +11 h +278 h +10 h +2352 m +1 h +4 h +10 h +174 h +2353 m +2354 m +238 h +1766 m +64 h +1 h +4 h +2355 m +4 h +2356 m +1 h +2357 m +2358 m +857 h +368 h +434 h +10 h +11 h +65 h +1 h +241 h +4 h +1 h +3 h +2359 m +1 h +8 h +12 h +779 m +4 h +10 h +94 h +1 h +10 h +2360 m +25 h +1 h +1 h +2361 m +2362 m +2363 m +4 h +4 h +412 m +93 m +1 h +2364 m +1 h +2365 m +1 h +10 h +4 h +1 h +4 h +31 h +4 h +2366 m +82 h +10 h +2367 m +124 h +2368 m +22 h +238 h +185 h +1 h +10 h +4 h +1627 m +10 h +10 h +1 h +3 h +4 h +4 h +4 h +4 h +172 h +2369 m +4 h +4 h +2370 m +10 h +1 h +1 h +2371 m +10 h +4 h +4 h +2372 m +2373 m +4 h +1 h +2374 m +1 h +3 h +1 h +1 h +11 h +156 h +10 h +4 h +1 h +204 h +146 h +857 h +10 h +3 h +4 h +1 h +4 h +74 h +59 h +59 h +4 h +1 h +4 h +25 h +2375 m +196 h +1 h +4 h 
+4 h +41 h +2376 m +330 h +11 h +1030 h +1 h +2377 m +104 h +10 h +2378 m +4 h +10 h +4 h +4 h +1 h +2379 m +4 h +172 h +4 h +1 h +1 h +4 h +4 h +4 h +4 h +12 h +167 h +2380 m +2381 m +2382 m +22 h +4 h +2383 m +1 h +1 h +1 h +2384 m +4 h +1 h +125 h +2385 m +2386 m +185 h +1 h +195 h +10 h +2387 m +262 h +1 h +1 h +1 h +2388 m +2389 m +4 h +1 h +2390 m +10 h +4 h +10 h +1 h +10 h +1 h +2391 m +11 h +4 h +146 h +11 h +119 h +4 h +10 h +2392 m +4 h +94 h +146 h +1 h +2393 m +4 h +2394 m +2395 m +2396 m +2397 m +192 h +4 h +1 h +2398 m +1 h +1 h +4 h +4 h +41 h +65 h +730 m +2399 m +2400 m +1250 h +2401 m +135 h +403 h +1 h +1 h +1 h +10 h +3 h +2402 m +2403 m +1128 m +11 h +69 h +196 h +110 h +1952 m +4 h +1 h +1 h +1 h +2404 m +4 h +2405 m +10 h +10 h +1 h +164 h +110 h +2406 m +83 h +10 h +2407 m +276 h +1 h +1 h +11 h +351 m +125 h +192 h +196 h +996 m +2408 m +2409 m +4 h +3 h +278 h +434 h +2410 m +4 h +4 h +10 h +1 h +4 h +10 h +4 h +2411 m +2412 m +2413 m +4 h +1 h +1 h +2414 m +258 h +83 h +204 h +1 h +82 h +2415 m +74 h +124 h +10 h +1 h +97 h +2416 m +2417 m +4 h +520 h +583 m +1 h +4 h +1 h +1 h +4 h +2418 m +4 h +59 h +2419 m +4 h +2420 m +2421 m +1 h +1 h +97 h +2422 m +2172 h +2423 m +79 h +4 h +2424 m +82 h +2425 m +41 h +10 h +2426 m +1 h +1 h +1 h +2427 m +630 m +124 h +10 h +2428 m +11 h +2429 m +1 h +4 h +2430 m +181 h +45 h +10 h +2431 m +22 h +10 h +1 h +1 h +11 h +4 h +1 h +2432 m +83 h +10 h +2433 m +4 h +4 h +4 h +10 h +1083 m +1 h +195 h +2434 m +2435 m +4 h +147 h +192 h +4 h +4 h +10 h +10 h +1 h +2436 m +1 h +1 h +41 h +4 h +1 h +2437 m +83 h +2438 m +1089 m +4 h +2439 m +4 h +4 h +10 h +2440 m +4 h +3 h +11 h +2441 m +4 h +170 h +1 h +109 h +91 h +31 h +4 h +65 h +2442 m +87 m +285 m +2443 m +2444 m +4 h +4 h +146 h +10 h +46 m +737 h +170 h +1 h +146 h +10 h +124 h +10 h +10 h +3 h +1027 h +10 h +2445 m +74 h +135 h +3 h +2446 m +4 h +1 h +1 h +4 h +808 m +10 h +2041 m +1481 m +4 h +83 h +83 h +2447 m +4 h +1 h +1 h +4 h +1 h +2448 m 
+10 h +57 h +2449 m +10 h +1 h +282 h +3 h +4 h +74 h +2450 m +4 h +1 h +167 h +25 h +1880 m +604 m +2451 m +2452 m +4 h +41 h +2453 m +3 h +147 h +83 h +4 h +4 h +2454 m +169 h +10 h +2455 m +1 h +55 h +11 h +1884 m +1 h +4 h +190 h +4 h +4 h +82 h +1 h +2456 m +3 h +4 h +2457 m +10 h +295 h +1 h +65 h +4 h +2458 m +258 h +4 h +4 h +109 h +2459 m +1 h +45 h +2460 m +4 h +1 h +2461 m +195 h +10 h +1 h +4 h +4 h +1 h +2462 m +4 h +25 h +2463 m +1 h +2464 m +4 h +2465 m +1448 m +2466 m +4 h +1 h +2467 m +1 h +1 h +4 h +2468 m +4 h +195 h +1 h +1650 m +1 h +1137 h +1 h +10 h +1 h +10 h +10 h +4 h +1 h +4 h +4 h +1 h +4 h +4 h +10 h +1 h +2469 m +4 h +4 h +581 m +195 h +2470 m +4 h +10 h +2471 m +82 h +2472 m +4 h +278 h +25 h +911 m +36 h +1 h +2473 m +2474 m +4 h +4 h +757 h +1 h +25 h +4 h +1 h +184 m +41 h +1 h +11 h +11 h +129 h +1 h +1 h +4 h +794 m +10 h +2475 m +4 h +2476 m +83 h +4 h +805 m +1 h +359 h +2477 m +2478 m +10 h +2479 m +4 h +4 h +2480 m +10 h +2481 m +59 h +4 h +10 h +4 h +10 h +3 h +11 h +10 h +3 h +1 h +2482 m +1 h +4 h +195 h +4 h +10 h +2483 m +2484 m +10 h +41 h +109 h +10 h +4 h +1 h +13 h +97 h +94 h +4 h +13 h +4 h +10 h +4 h +976 h +4 h +4 h +1 h +1 h +4 h +4 h +4 h +1 h +538 h +2485 m +109 h +10 h +10 h +10 h +1 h +4 h +79 h +2486 m +1 h +204 h +144 h +3 h +2487 m +10 h +31 h +1 h +92 h +10 h +10 h +59 h +135 h +4 h +65 h +45 h +1 h +4 h +4 h +65 h +28 h +4 h +1 h +10 h +10 h +4 h +1 h +140 h +2488 m +1205 m +10 h +82 h +156 h +3 h +1646 m +935 h +1 h +1 h +10 h +481 m +1 h +2489 m +2490 m +10 h +1 h +1 h +4 h +4 h +4 h +1 h +2491 m +4 h +109 h +10 h +31 h +4 h +2492 m +1 h +1 h +805 m +4 h +4 h +2493 m +3 h +4 h +4 h +2494 m +4 h +2444 m +55 h +31 h +2495 m +2496 m +1 h +129 h +11 h +4 h +2497 m +2498 m +4 h +4 h +4 h +4 h +4 h +12 h +4 h +1 h +2499 m +258 h +4 h +4 h +156 h +1766 m +295 h +258 h +55 h +82 h +4 h +79 h +4 h +4 h +195 h +2500 m +2501 m +2502 m +258 h +4 h +2503 m +4 h +4 h +124 h +4 h +109 h +2504 m +1 h +1308 m +170 h 
+4 h +1 h +4 h +164 h +4 h +4 h +1 h +1250 h +2505 m +74 h +4 h +31 h +109 h +10 h +31 h +4 h +2506 m +10 h +2507 m +147 h +10 h +1 h +2508 m +10 h +10 h +1 h +2509 m +170 h +195 h +109 h +4 h +1 h +1 h +2510 m +4 h +4 h +4 h +172 h +2511 m +4 h +1 h +258 h +1030 h +270 h +79 h +41 h +2512 m +109 h +4 h +1 h +110 h +4 h +73 h +10 h +57 h +41 h +10 h +4 h +4 h +2513 m +2514 m +4 h +56 h +1 h +2515 m +4 h +11 h +4 h +1 h +1 h +82 h +1 h +10 h +2516 m +1327 m +4 h +2517 m +1 h +55 h +1 h +1 h +83 h +2518 m +4 h +11 h +4 h +2519 m +4 h +1 h +10 h +10 h +4 h +74 h +1 h +11 h +1 h +1 h +1 h +2520 m +25 h +1 h +2521 m +2522 m +2523 m +55 h +1 h +2524 m +10 h +1 h +506 m +2525 m +82 h +4 h +10 h +1 h +57 h +4 h +10 h +2526 m +4 h +1 h +2527 m +114 h +10 h +10 h +10 h +2528 m +1 h +41 h +10 h +2529 m +119 h +4 h +10 h +1 h +1 h +1504 m +1 h +1738 m +10 h +2530 m +2531 m +55 h +4 h +4 h +1 h +266 h +10 h +1 h +4 h +2532 m +4 h +4 h +4 h +1359 h +4 h +4 h +4 h +195 h +4 h +181 h +4 h +4 h +119 h +4 h +11 h +57 h +10 h +1 h +1 h +332 h +11 h +4 h +408 m +57 h +45 h +1 h +1 h +4 h +124 h +3 h +4 h +4 h +10 h +2533 m +22 h +4 h +1 h +1 h +13 h +11 h +1 h +10 h +4 h +319 m +1 h +10 h +135 h +4 h +1 h +10 h +2534 m +1184 m +11 h +4 h +10 h +1 h +1 h +250 h +4 h +2535 m +1 h +110 h +4 h +2536 m +2537 m +2538 m +170 h +4 h +2539 m +4 h +1 h +1 h +172 h +4 h +569 h +2540 m +4 h +4 h +319 m +4 h +1 h +368 h +1 h +82 h +2541 m +4 h +3 h +1 h +143 h +83 h +4 h +10 h +2542 m +1 h +2543 m +4 h +1 h +10 h +10 h +4 h +4 h +1137 h +2544 m +11 h +4 h +2545 m +2546 m +2547 m +4 h +2548 m +82 h +8 h +4 h +83 h +59 h +10 h +10 h +2549 m +1 h +4 h +1 h +10 h +45 h +31 h +2550 m +1 h +4 h +11 h +1 h +1 h +4 h +25 h +4 h +1 h +1454 m +22 h +266 h +2551 m +1 h +2552 m +2553 m +2554 m +10 h +135 h +1 h +278 h +97 h +1 h +4 h +2555 m +113 h +493 h +196 h +59 h +10 h +195 h +563 m +109 h +10 h +10 h +794 m +2556 m +2557 m +430 m +425 m +41 h +4 h +4 h +4 h +2558 m +10 h +258 h +4 h +4 h +2559 m +4 h +1 
h +4 h +11 h +2560 m +1 h +976 h +578 m +12 h +1780 h +2561 m +83 h +276 h +1 h +2562 m +10 h +79 h +28 h +4 h +2563 m +2564 m +4 h +10 h +10 h +94 h +143 h +1 h +2565 m +4 h +11 h +2566 m +4 h +56 h +11 h +1 h +2567 m +1016 h +2568 m +4 h +332 h +1 h +77 h +4 h +11 h +4 h +82 h +4 h +2569 m +238 h +4 h +1 h +4 h +4 h +4 h +278 h +11 h +4 h +1619 m +1 h +1 h +4 h +10 h +3 h +2570 m +2571 m +195 h +4 h +4 h +1 h +10 h +2572 m +119 h +1 h +1 h +1 h +31 h +12 h +4 h +2573 m +2574 m +10 h +10 h +4 h +65 h +276 h +2575 m +124 h +1 h +57 h +31 h +4 h +4 h +4 h +10 h +2576 m +41 h +2577 m +46 m +125 h +2578 m +1939 m +2579 m +2580 m +4 h +25 h +2581 m +2582 m +10 h +10 h +124 h +2583 m +3 h +164 h +10 h +4 h +2584 m +4 h +1 h +10 h +31 h +33 m +2585 m +97 h +4 h +4 h +4 h +808 m +4 h +2586 m +4 h +4 h +4 h +4 h +2587 m +4 h +1 h +2588 m +1 h +4 h +912 m +386 h +4 h +92 h +1 h +4 h +1 h +167 h +3 h +1 h +2589 m +1 h +2590 m +4 h +54 m +97 h +4 h +1 h +10 h +2591 m +74 h +10 h +1 h +2592 m +10 h +4 h +1 h +10 h +4 h +1 h +2593 m +704 h +4 h +57 h +2594 m +2265 m +2595 m +2596 m +109 h +104 h +2597 m +1 h +1 h +1 h +2598 m +31 h +1 h +1 h +4 h +1 h +4 h +146 h +57 h +4 h +2599 m +13 h +10 h +69 h +92 h +1 h +2600 m +4 h +139 h +4 h +4 h +4 h +57 h +10 h +4 h +279 h +2601 m +2602 m +1 h +1 h +11 h +10 h +4 h +2603 m +10 h +332 h +11 h +4 h +2604 m +2605 m +1 h +4 h +1 h +2606 m +4 h +2607 m +4 h +147 h +2608 m +295 h +1 h +2609 m +10 h +4 h +10 h +4 h +4 h +104 h +13 h +1 h +10 h +56 h +10 h +1 h +1 h +2610 m +1 h +2611 m +36 h +2612 m +1 h +59 h +4 h +4 h +4 h +4 h +10 h +195 h +297 h +4 h +125 h +10 h +1 h +2613 m +4 h +692 h +4 h +2614 m +1 h +114 h +1 h +265 h +1968 m +124 h +4 h +2615 m +358 h +124 h +4 h +2616 m +2617 m +2618 m +190 h +11 h +2619 m +316 m +1 h +2620 m +2621 m +10 h +1 h +10 h +10 h +913 m +11 h +262 h +25 h +2622 m +2623 m +25 h +139 h +10 h +83 h +169 h +109 h +1 h +1 h +4 h +1 h +1 h +4 h +146 h +377 m +143 h +10 h +2624 m +2625 m +2626 m +2627 m 
+1751 m +2628 m +103 m +140 h +4 h +31 h +74 h +10 h +83 h +1 h +196 h +1 h +4 h +1508 m +2629 m +10 h +10 h +238 h +10 h +229 h +11 h +1 h +57 h +4 h +57 h +4 h +2630 m +10 h +2631 m +4 h +4 h +13 h +2632 m +2633 m +386 h +41 h +10 h +104 h +125 h +1 h +2047 m +279 h +4 h +4 h +10 h +195 h +2634 m +1 h +4 h +4 h +11 h +109 h +173 h +10 h +4 h +4 h +10 h +12 h +2635 m +1 h +1 h +3 h +229 h +11 h +2636 m +1261 h +59 h +2637 m +2638 m +2639 m +92 h +1 h +4 h +4 h +4 h +258 h +1 h +1 h +4 h +10 h +146 h +2640 m +1 h +1 h +11 h +278 h +27 h +4 h +1089 m +41 h +14 m +10 h +1 h +230 h +74 h +2641 m +4 h +2642 m +4 h +10 h +2643 m +1 h +1 h +1 h +10 h +158 h +10 h +2644 m +1 h +25 h +4 h +10 h +10 h +1 h +73 h +1 h +113 h +2645 m +359 h +36 h +2646 m +10 h +1406 h +1 h +4 h +1 h +4 h +1 h +10 h +2647 m +2648 m +2649 m +250 h +2650 m +1 h +4 h +2651 m +2652 m +2653 m +297 h +10 h +13 h +1535 h +2654 m +10 h +4 h +1 h +10 h +4 h +2655 m +4 h +219 m +1 h +10 h +4 h +108 h +1 h +11 h +1 h +2656 m +1 h +140 h +2657 m +1 h +408 m +119 h +1 h +31 h +59 h +4 h +447 h +1 h +1 h +114 h +45 h +2658 m +82 h +31 h +4 h +4 h +2659 m +4 h +2660 m +94 h +1 h +4 h +4 h +2661 m +10 h +250 h +10 h +4 h +1003 h +2662 m +2663 m +28 h +79 h +4 h +368 h +10 h +601 h +1 h +4 h +10 h +1 h +2664 m +1 h +1322 m +169 h +4 h +4 h +1 h +83 h +4 h +170 h +10 h +10 h +10 h +65 h +10 h +4 h +1 h +2665 m +2666 m +10 h +10 h +4 h +1 h +4 h +2667 m +299 h +10 h +468 m +2668 m +10 h +250 h +2669 m +1 h +31 h +169 h +2670 m +4 h +11 h +2671 m +976 h +10 h +110 h +10 h +2672 m +4 h +2673 m +4 h +330 h +4 h +4 h +4 h +11 h +1 h +2674 m +2675 m +10 h +1493 m +92 h +10 h +11 h +1981 m +11 h +1 h +4 h +138 h +10 h +4 h +4 h +4 h +2676 m +10 h +109 h +1 h +118 h +83 h +4 h +258 h +4 h +4 h +10 h +1 h +59 h +1 h +1 h +2677 m +10 h +4 h +10 h +2678 m +4 h +114 h +1 h +2679 m +1045 m +2680 m +1 h +4 h +10 h +33 m +2681 m +2682 m +4 h +1482 m +2683 m +238 h +10 h +1120 m +10 h +1 h +2684 m +2685 m +195 h +22 h +4 h 
+124 h +36 h +4 h +10 h +2686 m +55 h +92 h +2687 m +4 h +125 h +4 h +2688 m +4 h +12 h +124 h +2689 m +4 h +57 h +4 h +4 h +3 h +1780 h +1 h +4 h +10 h +10 h +124 h +2690 m +10 h +2691 m +10 h +2692 m +4 h +4 h +488 m +119 h +322 m +2693 m +2694 m +3 h +10 h +144 h +123 h +4 h +1 h +10 h +2695 m +399 h +1403 h +3 h +10 h +1 h +4 h +4 h +1 h +1 h +10 h +692 h +1 h +31 h +1 h +4 h +1 h +2696 m +2697 m +55 h +1 h +195 h +2698 m +1 h +11 h +4 h +4 h +2699 m +477 m +2700 m +65 h +10 h +332 h +10 h +83 h +10 h +2701 m +976 h +238 h +4 h +1 h +2702 m +3 h +1 h +1 h +22 h +1 h +2703 m +2494 m +2704 m +4 h +2705 m +2706 m +2707 m +10 h +125 h +64 h +25 h +125 h +464 h +125 h +1 h +1 h +4 h +40 h +1 h +2708 m +2709 m +1 h +82 h +57 h +1 h +1 h +2710 m +182 m +1 h +386 h +377 m +1 h +2711 m +4 h +10 h +1 h +250 h +4 h +1 h +4 h +1 h +79 h +2712 m +2713 m +2714 m +4 h +4 h +2715 m +1685 m +10 h +4 h +10 h +10 h +2716 m +4 h +4 h +10 h +1 h +2717 m +4 h +3 h +1 h +82 h +10 h +1 h +1 h +41 h +10 h +41 h +10 h +2718 m +4 h +1 h +156 h +2719 m +4 h +1 h +27 h +1 h +2720 m +297 h +10 h +10 h +1184 m +11 h +10 h +2721 m +2722 m +4 h +4 h +2723 m +1 h +65 h +10 h +1 h +1 h +4 h +4 h +4 h +125 h +4 h +10 h +4 h +1 h +2724 m +383 h +4 h +10 h +2725 m +4 h +92 h +10 h +59 h +1 h +2726 m +4 h +11 h +10 h +4 h +1 h +4 h +4 h +4 h +10 h +692 h +1 h +4 h +2727 m +1 h +41 h +1 h +4 h +82 h +297 h +55 h +10 h +264 m +10 h +2728 m +10 h +4 h +1 h +4 h +10 h +4 h +359 h +2729 m +36 h +79 h +185 h +4 h +2730 m +4 h +266 h +94 h +2731 m +1 h +4 h +3 h +4 h +4 h +2732 m +2733 m +4 h +4 h +135 h +45 h +2734 m +1 h +2735 m +4 h +2736 m +123 h +488 m +4 h +10 h +1 h +4 h +27 h +1 h +258 h +2737 m +2738 m +10 h +55 h +83 h +4 h +10 h +2739 m +3 h +10 h +4 h +11 h +4 h +10 h +10 h +425 m +10 h +4 h +4 h +1016 h +687 h +2740 m +1249 m +4 h +4 h +2741 m +2742 m +1 h +2743 m +2744 m +146 h +2745 m +2746 m +4 h +1493 m +1 h +82 h +274 h +238 h +1 h +146 h +10 h +4 h +1 h +4 h +4 h +2747 m +4 h +265 h 
+2748 m +2749 m +2750 m +1 h +1 h +1780 h +4 h +2751 m +2752 m +4 h +147 h +4 h +2753 m +2754 m +966 m +10 h +2309 m +4 h +2755 m +10 h +2756 m +4 h +2757 m +1 h +123 h +2758 m +1 h +1 h +1 h +520 h +10 h +10 h +4 h +10 h +2759 m +230 h +1 h +4 h +4 h +10 h +1 h +4 h +4 h +4 h +11 h +4 h +167 h +1 h +10 h +158 h +2760 m +1 h +2688 m +4 h +57 h +2761 m +4 h +74 h +4 h +10 h +4 h +125 h +2762 m +468 m +1 h +2763 m +4 h +4 h +358 h +4 h +83 h +11 h +11 h +10 h +1 h +31 h +59 h +2764 m +2379 m +82 h +2765 m +1 h +2766 m +79 h +82 h +73 h +55 h +1 h +4 h +4 h +59 h +3 h +2767 m +31 h +2768 m +10 h +1 h +2769 m +83 h +10 h +10 h +1 h +2770 m +4 h +196 h +1370 m +185 h +238 h +2771 m +25 h +1 h +4 h +4 h +55 h +109 h +10 h +279 h +1470 h +4 h +10 h +4 h +10 h +1642 m +1 h +4 h +10 h +119 h +3 h +1406 h +114 h +2772 m +1 h +2773 m +10 h +4 h +996 m +124 h +601 h +4 h +4 h +45 h +59 h +11 h +4 h +10 h +4 h +123 h +110 h +2079 m +2774 m +1 h +1 h +1 h +2775 m +1 h +230 h +10 h +10 h +2776 m +2777 m +1 h +1 h +250 h +2778 m +1 h +10 h +2779 m +2780 m +55 h +4 h +2781 m +11 h +119 h +56 h +1 h +125 h +1 h +82 h +1 h +64 h +57 h +103 m +2782 m +169 h +4 h +167 h +108 h +59 h +2783 m +1 h +10 h +10 h +2784 m +10 h +4 h +82 h +2785 m +1 h +109 h +4 h +2786 m +79 h +2787 m +31 h +1 h +1 h +10 h +22 h +10 h +4 h +1 h +4 h +1409 h +4 h +2788 m +2789 m +4 h +1 h +1 h +1 h +10 h +2314 m +1 h +11 h +1 h +4 h +4 h +986 h +2790 m +1308 m +278 h +3 h +124 h +4 h +1 h +10 h +31 h +2791 m +10 h +109 h +1030 h +124 h +4 h +4 h +367 h +2792 m +25 h +135 h +4 h +82 h +2793 m +104 h +10 h +4 h +10 h +2794 m +2795 m +1 h +2374 m +4 h +10 h +1 h +4 h +2796 m +1 h +4 h +3 h +4 h +1 h +1409 h +1 h +1 h +112 h +124 h +4 h +1 h +383 h +2139 m +10 h +1 h +4 h +12 h +4 h +10 h +2797 m +1 h +358 h +1089 m +41 h +2798 m +55 h +2799 m +1299 m +1838 m +10 h +2800 m +1 h +4 h +2801 m +109 h +1 h +10 h +4 h +2802 m +2803 m +144 h +238 h +2804 m +2805 m +2806 m +570 m +10 h +2807 m +74 h +700 m +1 h +125 h +4 
h +10 h +104 h +157 h +10 h +4 h +1 h +2808 m +2809 m +4 h +1 h +12 h +1595 m +2810 m +2811 m +1 h +4 h +64 h +1 h +3 h +65 h +94 h +146 h +2812 m +2813 m +2814 m +4 h +4 h +1 h +10 h +2815 m +4 h +386 h +1 h +1 h +4 h +93 h +2816 m +2817 m +368 h +3 h +10 h +10 h +4 h +278 h +4 h +2818 m +69 h +10 h +11 h +4 h +10 h +2819 m +1 h +4 h +10 h +1 h +443 h +10 h +4 h +2820 m +1089 h +4 h +4 h +1 h +4 h +113 h +2821 m +91 h +2822 m +10 h +1 h +41 h +1 h +4 h +4 h +1 h +64 h +10 h +57 h +443 h +2823 m +2824 m +1 h +10 h +10 h +1 h +11 h +262 h +4 h +4 h +2825 m +2826 m +2827 m +1 h +1 h +1 h +2828 m +123 h +1 h +4 h +2829 m +2830 m +2831 m +1 h +1 h +2832 m +92 h +10 h +1822 m +4 h +10 h +2833 m +2834 m +4 h +538 h +1 h +1 h +1 h +25 h +10 h +656 m +4 h +2835 m +146 h +1 h +2836 m +4 h +1 h +536 h +10 h +10 h +443 h +59 h +4 h +59 h +114 h +92 h +4 h +172 h +4 h +1 h +83 h +2837 m +1 h +4 h +2838 m +11 h +11 h +2839 m +2840 m +2841 m +2842 m +4 h +157 h +1048 m +1 h +1261 h +209 m +258 h +2843 m +10 h +4 h +10 h +1 h +2844 m +4 h +2845 m +4 h +557 m +520 h +170 h +556 m +1 h +2846 m +1 h +31 h +4 h +4 h +1 h +57 h +2847 m +10 h +1 h +1 h +129 h +1 h +2848 m +125 h +4 h +279 h +4 h +2849 m +4 h +10 h +185 h +4 h +10 h +10 h +250 h +2850 m +173 h +11 h +4 h +64 h +2851 m +1261 h +12 h +10 h +509 m +2852 m +82 h +626 m +4 h +59 h +4 h +1 h +4 h +2853 m +2854 m +10 h +10 h +12 h +295 h +4 h +556 m +2855 m +10 h +4 h +10 h +10 h +119 h +2856 m +2857 m +403 h +4 h +1 h +2858 m +2859 m +2860 m +1 h +4 h +4 h +4 h +13 h +2861 m +181 h +10 h +4 h +1 h +57 h +10 h +31 h +4 h +3 h +2862 m +1 h +4 h +4 h +4 h +164 h +4 h +2863 m +2864 m +1 h +4 h +109 h +1 h +2374 m +1 h +10 h +10 h +4 h +4 h +1 h +307 h +25 h +4 h +2865 m +4 h +2866 m +976 h +2867 m +2868 m +195 h +313 m +10 h +4 h +11 h +4 h +4 h +2869 m +169 h +10 h +295 h +1 h +2870 m +10 h +195 h +2871 m +10 h +928 m +3 h +172 h +11 h +403 h +4 h +2872 m +1 h +1 h +1 h +10 h +258 h +74 h +4 h +135 h +2873 m +1 h +172 h +10 h +4 
h +22 h +190 h +2874 m +4 h +167 h +1 h +1 h +10 h +1 h +4 h +1 h +109 h +4 h +143 h +4 h +4 h +4 h +2875 m +190 h +2876 m +83 h +10 h +10 h +2877 m +4 h +13 h +1470 h +1 h +297 h +59 h +10 h +2878 m +25 h +57 h +82 h +359 h +10 h +219 m +1 h +1 h +2879 m +1 h +4 h +2025 m +4 h +4 h +2880 m +4 h +4 h +1 h +1 h +820 m +3 h +1 h +2022 m +2881 m +2882 m +2883 m +93 h +4 h +124 h +11 h +10 h +4 h +10 h +2884 m +250 h +278 h +2885 m +747 m +4 h +11 h +124 h +114 h +4 h +1 h +124 h +11 h +10 h +10 h +1127 m +1 h +10 h +10 h +1 h +4 h +2886 m +36 h +1 h +2887 m +2888 m +13 h +82 h +10 h +11 h +2889 m +1 h +10 h +1 h +10 h +196 h +1 h +4 h +2890 m +10 h +82 h +2891 m +195 h +2892 m +2893 m +1 h +1261 h +1 h +27 h +4 h +10 h +4 h +10 h +1 h +1 h +79 h +59 h +1 h +10 h +1 h +1 h +10 h +4 h +4 h +10 h +31 h +10 h +124 h +939 h +2625 m +1 h +1 h +82 h +4 h +1 h +1 h +4 h +2894 m +158 h +295 h +2895 m +2896 m +2897 m +2898 m +4 h +41 h +4 h +144 h +146 h +4 h +1 h +358 h +2899 m +10 h +11 h +4 h +2900 m +1 h +10 h +2901 m +2902 m +10 h +1 h +4 h +4 h +4 h +10 h +2903 m +4 h +2904 m +4 h +1 h +2905 m +2906 m +4 h +164 h +1 h +359 h +59 h +4 h +10 h +10 h +2907 m +2908 m +386 h +4 h +173 h +4 h +57 h +4 h +2909 m +4 h +4 h +45 h +1 h +22 h +11 h +4 h +41 h +1 h +4 h +1 h +1 h +4 h +4 h +124 h +4 h +1 h +82 h +1 h +1 h +124 h +4 h +2910 m +1 h +10 h +41 h +4 h +11 h +4 h +2911 m +2912 m +27 h +10 h +10 h +57 h +230 h +2913 m +10 h +4 h +10 h +2891 m +10 h +4 h +4 h +2914 m +2915 m +97 h +2916 m +36 h +2917 m +533 m +146 h +266 h +2918 m +123 h +4 h +2919 m +25 h +10 h +307 h +2920 m +4 h +1 h +83 h +2921 m +4 h +10 h +186 h +4 h +1 h +4 h +4 h +4 h +2025 m +2922 m +4 h +4 h +4 h +2923 m +11 h +11 h +4 h +2924 m +10 h +2925 m +2926 m +109 h +116 m +2927 m +1 h +2928 m +238 h +124 h +4 h +2929 m +10 h +41 h +4 h +1 h +4 h +82 h +10 h +2930 m +332 h +10 h +1 h +4 h +82 h +1 h +2931 m +1 h +4 h +4 h +1 h +1 h +4 h +1 h +10 h +169 h +4 h +1 h +2932 m +27 h +59 h +4 h +536 h +3 h +2933 
m +10 h +1 h +10 h +109 h +195 h +569 h +2934 m +59 h +2935 m +10 h +10 h +10 h +33 h +31 h +11 h +4 h +4 h +10 h +83 h +10 h +156 h +4 h +2936 m +888 m +10 h +10 h +10 h +12 h +10 h +1650 m +2937 m +1 h +1 h +258 h +3 h +1 h +2938 m +10 h +1 h +1089 h +10 h +4 h +4 h +1 h +2939 m +4 h +2940 m +79 h +4 h +332 h +10 h +2941 m +2942 m +2943 m +59 h +2944 m +1 h +1 h +4 h +190 h +10 h +10 h +97 h +2945 m +4 h +4 h +59 h +2946 m +150 m +408 h +27 h +1 h +2947 m +2948 m +4 h +97 h +2949 m +173 h +2950 m +2951 m +2794 m +2952 m +2953 m +3 h +4 h +1 h +144 h +10 h +135 h +4 h +1 h +10 h +1 h +2954 m +10 h +2955 m +1 h +2956 m +2957 m +1 h +3 h +1 h +1 h +4 h +92 h +570 m +2245 m +2958 m +10 h +4 h +1 h +109 h +2935 m +4 h +10 h +10 h +1 h +144 h +2004 m +1 h +358 h +124 h +41 h +2959 m +2960 m +4 h +2961 m +124 h +2054 m +4 h +1 h +2962 m +4 h +4 h +11 h +2963 m +2964 m +109 h +4 h +2965 m +4 h +4 h +57 h +2966 m +2967 m +2968 m +11 h +10 h +2969 m +2970 m +110 h +4 h +2971 m +4 h +536 h +2972 m +10 h +4 h +447 h +272 h +59 h +10 h +69 h +10 h +4 h +10 h +2973 m +241 h +4 h +4 h +195 h +4 h +1 h +10 h +1 h +10 h +332 h +1 h +2974 m +10 h +2975 m +2976 m +57 h +1 h +1 h +57 h +10 h +10 h +11 h +139 h +10 h +2977 m +10 h +22 h +11 h +55 h +2978 m +2979 m +2980 m +2981 m +10 h +954 m +1 h +164 h +2982 m +10 h +2983 m +2984 m +4 h +10 h +4 h +1 h +1 h +3 h +25 h +4 h +4 h +4 h +82 h +56 h +59 h +1163 m +2985 m +31 h +2986 m +1 h +56 h +4 h +4 h +11 h +97 h +1 h +10 h +4 h +10 h +57 h +450 m +2987 m +4 h +10 h +2988 m +13 h +4 h +82 h +1 h +181 h +4 h +109 h +10 h +1 h +1 h +59 h +2989 m +2990 m +135 h +1 h +74 h +1 h +125 h +1 h +139 h +4 h +2991 m +109 h +82 h +82 h +4 h +2992 m +1 h +1 h +4 h +10 h +124 h +10 h +1 h +11 h +1 h +11 h +73 h +4 h +4 h +10 h +4 h +2993 m +1 h +1 h +4 h +82 h +74 h +69 h +4 h +11 h +2994 m +4 h +109 h +2995 m +4 h +1 h +2379 m +4 h +10 h +79 h +238 h +10 h +1 h +4 h +10 h +1 h +1478 h +74 h +4 h +185 h +186 h +4 h +4 h +2996 m +4 h +10 h +204 h 
+3 h +10 h +2997 m +2998 m +2999 m +1 h +109 h +28 h +170 h +3000 m +1766 h +4 h +1 h +250 h +4 h +1 h +10 h +3001 m +13 h +367 h +10 h +295 h +1 h +4 h +3002 m +3003 m +11 h +4 h +4 h +250 h +4 h +575 h +10 h +10 h +3004 m +10 h +1 h +4 h +4 h +3005 m +4 h +3006 m +3007 m +1 h +1 h +4 h +1 h +4 h +4 h +170 h +4 h +195 h +857 h +31 h +4 h +3008 m +4 h +27 h +1 h +124 h +4 h +3009 m +10 h +3010 m +1030 h +10 h +57 h +41 h +1 h +10 h +911 m +104 h +10 h +3011 m +313 m +1 h +4 h +3012 m +10 h +4 h +1 h +3013 m +10 h +3014 m +4 h +104 h +3015 m +10 h +3016 m +91 h +4 h +1 h +10 h +195 h +1 h +3 h +4 h +10 h +538 h +12 h +10 h +1 h +3017 m +2272 m +4 h +3018 m +25 h +4 h +1 h +195 h +10 h +1 h +10 h +4 h +4 h +10 h +10 h +1 h +4 h +3019 m +3020 m +313 h +70 m +1 h +4 h +3021 m +12 h +601 h +1 h +1 h +13 h +3022 m +4 h +4 h +10 h +156 h +1 h +3023 m +4 h +4 h +1 h +10 h +10 h +3024 m +41 h +2928 m +295 h +10 h +3025 m +4 h +4 h +3026 m +3 h +4 h +1 h +3027 m +146 h +10 h +3028 m +3029 m +4 h +4 h +4 h +1 h +4 h +1 h +41 h +289 h +4 h +10 h +1 h +10 h +3030 m +10 h +4 h +3031 m +3032 m +4 h +1 h +59 h +4 h +4 h +3033 m +3034 m +41 h +3035 m +4 h +109 h +59 h +45 h +119 h +3036 m +3037 m +109 h +10 h +1 h +3038 m +4 h +10 h +3039 m +10 h +4 h +22 h +123 h +4 h +3040 m +4 h +3041 m +4 h +4 h +3042 m +241 h +10 h +295 h +1 h +1 h +3043 m +1 h +59 h +185 h +190 h +3044 m +1 h +1 h +1 h +10 h +1 h +94 h +1 h +3045 m +10 h +3046 m +59 h +3047 m +57 h +4 h +229 h +4 h +22 h +3048 m +4 h +10 h +3049 m +4 h +82 h +4 h +4 h +3050 m +3051 m +1 h +10 h +279 h +2038 m +3052 m +92 h +3053 m +10 h +59 h +83 h +41 h +4 h +4 h +1 h +4 h +1 h +64 h +147 h +4 h +4 h +4 h +10 h +4 h +4 h +2851 m +25 h +69 h +4 h +1 h +4 h +2607 m +4 h +1 h +278 h +1619 m +25 h +11 h +1 h +1 h +4 h +73 h +10 h +146 h +278 h +1 h +4 h +13 h +3054 m +966 m +56 h +36 h +4 h +1 h +1 h +1 h +3055 m +3056 m +10 h +4 h +1 h +1 h +10 h +1 h +11 h +1796 h +4 h +3057 m +41 h +3058 m +3059 m +4 h +1 h +3060 m +12 h +1 h 
+1 h +3061 m +1 h +55 h +3062 m +10 h +4 h +10 h +158 h +3063 m +82 h +3064 m +11 h +3065 m +1 h +4 h +83 h +3066 m +57 h +10 h +172 h +10 h +3067 m +1861 m +11 h +2582 m +4 h +74 h +3068 m +3069 m +976 h +1 h +4 h +4 h +332 h +4 h +4 h +224 h +10 h +276 h +1 h +4 h +10 h +11 h +479 m +57 h +10 h +4 h +3070 m +368 h +25 h +4 h +129 h +10 h +1 h +10 h +4 h +4 h +10 h +5 m +1 h +4 h +4 h +1 h +4 h +10 h +4 h +1 h +10 h +3071 m +10 h +3072 m +83 h +1116 m +3073 m +1 h +3074 m +11 h +4 h +463 m +10 h +4 h +195 h +59 h +11 h +250 h +3075 m +3076 m +1 h +4 h +170 h +2532 m +1 h +1089 h +3077 m +10 h +4 h +3078 m +3079 m +10 h +4 h +3080 m +11 h +10 h +1 h +4 h +857 h +3081 m +3082 m +4 h +1 h +1 h +1444 m +11 h +3 h +358 h +91 h +3083 m +10 h +3084 m +4 h +1 h +10 h +788 m +1 h +1261 h +4 h +1 h +4 h +10 h +488 h +3085 m +1 h +601 h +4 h +3086 m +10 h +10 h +10 h +1 h +3087 m +74 h +1 h +4 h +1 h +119 h +4 h +3088 m +10 h +69 h +4 h +10 h +125 h +147 h +11 h +114 h +31 h +3089 m +3090 m +368 h +4 h +4 h +4 h +986 h +3091 m +413 m +10 h +307 h +1 h +11 h +10 h +4 h +3092 m +1 h +104 h +11 h +266 h +8 h +4 h +1 h +4 h +1884 m +278 h +3093 m +1359 h +164 h +124 h +45 h +4 h +4 h +4 h +4 h +256 m +1 h +4 h +4 h +4 h +1 h +1 h +3094 m +1 h +1 h +800 m +3095 m +4 h +4 h +1817 m +11 h +1 h +77 h +4 h +59 h +2733 m +1 h +3096 m +3097 m +195 h +4 h +3098 m +536 h +3099 m +640 h +10 h +295 h +3100 m +3101 m +1 h +36 h +196 h +3102 m +109 h +91 h +4 h +4 h +1 h +146 h +4 h +125 h +4 h +4 h +3103 m +4 h +3104 m +11 h +3105 m +57 h +11 h +82 h +169 h +368 h +3106 m +4 h +1 h +4 h +4 h +82 h +82 h +10 h +3107 m +10 h +65 h +83 h +1881 m +358 h +1 h +10 h +83 h +1 h +408 h +57 h +4 h +4 h +3108 m +3109 m +4 h +1 h +583 m +4 h +3110 m +82 h +10 h +59 h +64 h +3111 m +135 h +31 h +146 h +10 h +1790 m +265 h +1 h +3112 m +4 h +3113 m +57 h +10 h +82 h +82 h +3114 m +1 h +109 h +65 h +4 h +10 h +109 h +1 h +3115 m +3116 m +94 h +10 h +82 h +45 h +4 h +640 h +10 h +11 h +3117 m +3118 m +65 
h +10 h +1 h +10 h +3 h +4 h +139 h +3119 m +4 h +10 h +41 h +3120 m +1 h +4 h +4 h +10 h +1822 m +332 h +1 h +3121 m +10 h +3122 m +4 h +4 h +4 h +4 h +4 h +3123 m +10 h +4 h +146 h +3124 m +3125 m +4 h +4 h +4 h +10 h +1 h +3126 m +3127 m +3128 m +4 h +11 h +10 h +146 h +41 h +45 h +1 h +10 h +1 h +3129 m +911 m +10 h +3130 m +1 h +125 h +3131 m +371 h +4 h +36 h +109 h +10 h +3132 m +1 h +1 h +4 h +3133 m +10 h +4 h +3 h +3134 m +10 h +332 h +1 h +4 h +1 h +3135 m +3136 m +10 h +276 h +3137 m +87 m +109 h +3138 m +4 h +1 h +10 h +25 h +167 h +4 h +3139 m +4 h +25 h +3140 m +1 h +25 h +22 h +270 h +3141 m +139 h +3142 m +10 h +27 h +779 m +10 h +4 h +1 h +55 h +1 h +4 h +4 h +27 h +25 h +57 h +274 h +4 h +3143 m +1 h +74 h +184 h +1 h +94 h +11 h +1 h +536 h +1 h +1 h +59 h +1 h +1 h +3144 m +3145 m +10 h +92 h +4 h +11 h +3146 m +1 h +3147 m +3148 m +4 h +1 h +124 h +40 h +1 h +3149 m +3150 m +307 h +10 h +465 m +3151 m +1 h +10 h +4 h +4 h +10 h +110 h +11 h +31 h +10 h +1 h +11 h +59 h +1 h +1 h +3152 m +10 h +4 h +2710 m +4 h +3153 m +10 h +4 h +3154 m +3155 m +4 h +3156 m +4 h +3157 m +10 h +1 h +4 h +10 h +1 h +3158 m +1 h +10 h +10 h +10 h +10 h +1184 m +4 h +3159 m +955 m +3160 m +10 h +4 h +976 h +2688 h +4 h +4 h +1 h +3161 m +109 h +1 h +147 h +4 h +3162 m +3163 m +83 h +434 h +1 h +11 h +4 h +1939 m +3164 m +41 h +59 h +12 h +184 h +143 h +4 h +4 h +1 h +82 h +1 h +4 h +3165 m +2964 m +10 h +4 h +4 h +3166 m +173 h +10 h +10 h +172 h +1 h +109 h +75 m +3167 m +3168 m +10 h +2865 m +4 h +3169 m +3170 m +358 h +368 h +4 h +1 h +1 h +4 h +3171 m +4 h +274 h +3172 m +57 h +1 h +3173 m +3174 m +3175 m +10 h +82 h +31 h +10 h +1 h +185 h +386 h +3176 m +4 h +46 h +2475 m +31 h +125 h +138 h +10 h +147 h +10 h +97 h +4 h +3 h +3177 m +3178 m +13 h +10 h +3179 m +4 h +1 h +266 h +110 h +10 h +123 h +4 h +109 h +4 h +83 h +3180 m +41 h +10 h +33 h +3181 m +10 h +3182 m +1218 m +4 h +4 h +1 h +3183 m +1 h +59 h +1 h +4 h +10 h +4 h +4 h +358 h +4 h +4 h +11 h 
+4 h +4 h +4 h +27 h +4 h +4 h +4 h +3184 m +10 h +1 h +3185 m +4 h +4 h +1 h +8 h +4 h +3186 m +3187 m +4 h +1 h +3188 m +3189 m +229 h +94 h +1880 m +3190 m +1 h +3191 m +3192 m +114 h +1 h +4 h +3193 m +3194 m +4 h +79 h +70 m +73 h +74 h +2851 m +74 h +3195 m +4 h +3196 m +4 h +10 h +1685 m +481 m +97 h +399 h +3197 m +56 h +41 h +1544 m +172 h +3198 m +97 h +94 h +181 h +11 h +1 h +295 h +116 m +4 h +104 h +4 h +3199 m +10 h +4 h +124 h +169 h +93 h +4 h +1 h +3200 m +25 h +3201 m +1 h +4 h +4 h +69 h +4 h +1306 m +1822 m +10 h +3202 m +1 h +172 h +3203 m +25 h +1 h +1 h +3204 m +1 h +4 h +256 m +1 h +4 h +3205 m +172 h +1 h +10 h +181 h +730 m +4 h +3206 m +11 h +2205 m +4 h +1953 m +4 h +4 h +3207 m +4 h +124 h +4 h +1 h +4 h +3208 m +190 h +425 m +1 h +10 h +146 h +4 h +41 h +4 h +147 h +10 h +10 h +3209 m +109 h +4 h +4 h +109 h +83 h +3210 m +3211 m +3212 m +3213 m +4 h +123 h +10 h +3214 m +3215 m +10 h +3216 m +1016 h +4 h +3217 m +1 h +4 h +1 h +1 h +79 h +8 h +4 h +3218 m +10 h +1 h +110 h +4 h +94 h +10 h +3219 m +1493 h +10 h +10 h +3 h +77 h +147 h +4 h +1 h +3220 m +276 h +434 h +3221 m +10 h +297 h +3222 m +11 h +10 h +104 h +11 h +10 h +83 h +3223 m +3224 m +1 h +169 h +4 h +1 h +299 h +3225 m +1642 m +1 h +11 h +4 h +3 h +12 h +4 h +1620 m +3226 m +1403 h +11 h +3 h +114 h +143 h +172 h +1 h +48 h +10 h +10 h +3227 m +10 h +468 h +3228 m +3229 m +3230 m +3231 m +1 h +3232 m +3233 m +4 h +10 h +3234 m +1 h +3235 m +1 h +3236 m +3237 m +10 h +195 h +3238 m +1 h +195 h +4 h +1 h +11 h +1 h +1504 m +4 h +3239 m +1 h +1 h +74 h +82 h +383 h +1 h +1 h +4 h +40 h +11 h +59 h +74 h +3240 m +10 h +1 h +1 h +4 h +4 h +4 h +3241 m +10 h +1 h +2558 m +10 h +4 h +57 h +10 h +1 h +447 h +196 h +3242 m +238 h +4 h +1 h +3 h +3243 m +386 h +11 h +174 h +656 m +569 h +4 h +3244 m +3245 m +10 h +3214 m +10 h +10 h +3246 m +3247 m +1 h +10 h +1 h +443 h +1 h +10 h +55 h +3248 m +1 h +156 h +10 h +4 h +104 h +1357 m +256 h +1 h +1 h +1 h +10 h +1 h +92 h +1 h +509 
m +10 h +808 h +83 h +12 h +13 h +8 h +4 h +4 h +1261 h +4 h +125 h +4 h +82 h +3249 m +4 h +10 h +3250 m +1 h +124 h +986 h +10 h +1 h +4 h +4 h +4 h +4 h +4 h +3251 m +4 h +10 h +4 h +3252 m +4 h +10 h +4 h +1 h +238 h +157 h +1 h +31 h +3253 m +4 h +4 h +4 h +1 h +3254 m +3255 m +3256 m +69 h +4 h +4 h +477 m +3 h +4 h +147 h +82 h +4 h +59 h +1 h +3257 m +1764 m +10 h +4 h +408 h +10 h +3258 m +25 h +196 h +3259 m +3260 m +1321 h +167 h +156 h +1 h +109 h +3261 m +10 h +4 h +3262 m +124 h +3263 m +11 h +11 h +3264 m +4 h +1 h +3265 m +25 h +12 h +94 h +3266 m +1 h +307 h +10 h +1 h +3267 m +3 h +8 h +41 h +10 h +4 h +3268 m +1089 h +3269 m +2522 m +1535 h +3270 m +3271 m +1 h +3272 m +4 h +1 h +3273 m +156 h +3274 m +3275 m +3276 m +3277 m +41 h +4 h +3278 m +1 h +4 h +386 h +10 h +3279 m +3274 m +3280 m +146 h +4 h +3 h +3281 m +10 h +1 h +97 h +59 h +1 h +3282 m +359 h +3 h +4 h +10 h +10 h +73 h +4 h +4 h +4 h +4 h +25 h +3283 m +1 h +109 h +1 h +10 h +31 h +119 h +4 h +278 h +10 h +3284 m +3285 m +1437 m +3286 m +1070 m +4 h +3287 m +10 h +4 h +124 h +4 h +3288 m +3289 m +3290 m +3291 m +3067 m +3292 m +4 h +181 h +3293 m +3294 m +1261 h +3295 m +164 h +4 h +4 h +1 h +27 h +1 h +83 h +64 h +1 h +10 h +1 h +1 h +1 h +238 h +1 h +4 h +1 h +238 h +1 h +4 h +3296 m +4 h +4 h +4 h +83 h +3297 m +11 h +59 h +97 h +258 h +3298 m +143 h +41 h +265 h +1 h +10 h +10 h +97 h +139 h +3299 m +11 h +1 h +59 h +1 h +64 h +1 h +4 h +10 h +11 h +10 h +13 h +10 h +1 h +3300 m +10 h +3301 m +4 h +295 h +10 h +3302 m +1 h +403 h +383 h +4 h +3303 m +1 h +124 h +4 h +4 h +3 h +196 h +3304 m +1 h +4 h +3305 m +25 h +10 h +4 h +4 h +3306 m +1 h +4 h +10 h +97 h +10 h +1 h +4 h +4 h +3307 m +3308 m +174 h +4 h +295 h +1 h +4 h +1 h +10 h +279 h +1030 h +11 h +3309 m +3310 m +4 h +195 h +4 h +27 h +1 h +1 h +10 h +143 h +1 h +3311 m +64 h +1 h +4 h +167 h +4 h +1 h +3312 m +258 h +4 h +3313 m +3314 m +4 h +196 h +3315 m +73 h +190 h +4 h +258 h +368 h +1250 h +276 h +110 h +1 h 
+156 h +4 h +1 h +143 h +129 h +1 h +4 h +3316 m +779 m +11 h +3317 m +10 h +1 h +57 h +4 h +4 h +3318 m +109 h +1 h +1650 m +4 h +124 h +4 h +12 h +2163 m +3319 m +124 h +1 h +3320 m +3321 m +10 h +3322 m +2920 m +25 h +10 h +276 h +4 h +3323 m +119 h +1981 m +3324 m +4 h +3325 m +59 h +262 h +3326 m +10 h +31 h +3327 m +4 h +83 h +3328 m +4 h +869 m +25 h +10 h +3329 m +4 h +10 h +4 h +4 h +4 h +4 h +1 h +1 h +169 h +3330 m +1 h +3331 m +4 h +45 h +4 h +4 h +4 h +143 h +135 h +4 h +3332 m +1 h +1 h +1 h +10 h +3333 m +1 h +4 h +190 h +4 h +4 h +3334 m +3335 m +11 h +10 h +3336 m +10 h +31 h +1 h +990 m +4 h +1 h +4 h +124 h +25 h +4 h +4 h +4 h +69 h +97 h +190 h +3337 m +10 h +195 h +995 m +1 h +10 h +11 h +3338 m +2733 m +3339 m +1 h +230 h +3340 m +57 h +31 h +10 h +1 h +45 h +10 h +278 h +40 h +4 h +3341 m +4 h +3342 m +4 h +1 h +3343 m +3344 m +4 h +22 h +4 h +3345 m +3346 m +114 h +4 h +109 h +1 h +1 h +12 h +4 h +25 h +3347 m +1 h +4 h +3348 m +3349 m +4 h +258 h +10 h +3350 m +4 h +3351 m +3352 m +1 h +1 h +3353 m +4 h +10 h +4 h +1 h +4 h +1 h +4 h +1 h +2025 h +3354 m +4 h +1 h +3355 m +4 h +4 h +4 h +4 h +11 h +1 h +3 h +838 m +1 h +10 h +10 h +74 h +4 h +3356 m +332 h +238 h +4 h +3357 m +1053 m +1250 h +3358 m +4 h +124 h +4 h +3359 m +135 h +285 m +59 h +4 h +4 h +11 h +1 h +1 h +31 h +97 h +3360 m +11 h +4 h +3361 m +3362 m +1 h +556 h +3363 m +172 h +3364 m +1 h +195 h +3365 m +1137 h +964 m +146 h +10 h +10 h +1 h +1 h +3366 m +3367 m +164 h +4 h +4 h +156 h +3368 m +383 h +3369 m +3370 m +1359 h +10 h +3371 m +1 h +10 h +10 h +4 h +3372 m +332 h +25 h +1 h +4 h +1 h +4 h +1 h +4 h +3373 m +10 h +135 h +4 h +10 h +83 h +1 h +1 h +4 h +4 h +10 h +1 h +4 h +59 h +4 h +4 h +169 h +4 h +10 h +4 h +10 h +1 h +109 h +1 h +1 h +83 h +3374 m +4 h +1772 m +10 h +10 h +3375 m +3376 m +1 h +270 h +3377 m +10 h +3378 m +1 h +4 h +4 h +4 h +1822 h +147 h +3379 m +4 h +1 h +4 h +10 h +3380 m +258 h +4 h +1 h +2928 h +11 h +3381 m +10 h +10 h +10 h +92 h +3382 
m +10 h +3383 m +4 h +1 h +1 h +104 h +1 h +22 h +82 h +1 h +1 h +10 h +10 h +3384 m +41 h +3209 m +3385 m +10 h +3386 m +1 h +319 h +1 h +158 h +4 h +82 h +196 h +4 h +1 h +1 h +97 h +1 h +4 h +56 h +10 h +3387 m +124 h +278 h +114 h +4 h +4 h +10 h +1 h +1 h +97 h +4 h +3388 m +1835 m +31 h +1 h +3389 m +3390 m +192 h +4 h +10 h +4 h +10 h +4 h +41 h +4 h +3391 m +278 h +4 h +4 h +4 h +4 h +10 h +3392 m +4 h +11 h +4 h +4 h +3393 m +4 h +1 h +3394 m +59 h +3395 m +4 h +1 h +3396 m +196 h +11 h +10 h +718 h +3397 m +3398 m +25 h +109 h +10 h +4 h +56 h +4 h +4 h +59 h +124 h +4 h +4 h +3399 m +3033 m +94 h +1 h +1 h +164 h +1770 m +3400 m +164 h +4 h +1 h +174 h +1 h +4 h +1 h +3401 m +170 h +4 h +3402 m +1 h +1 h +1 h +10 h +3403 m +1 h +4 h +1 h +4 h +10 h +82 h +203 m +3404 m +4 h +4 h +1 h +1 h +278 h +3405 m +125 h +4 h +307 h +1 h +3406 m +135 h +3407 m +276 h +1 h +10 h +1 h +1 h +1 h +278 h +1 h +3408 m +1 h +4 h +109 h +1 h +3409 m +97 h +3410 m +3411 m +3412 m +10 h +250 h +3413 m +40 h +36 h +4 h +27 h +10 h +3414 m +3415 m +3416 m +94 h +4 h +41 h +172 h +4 h +10 h +3417 m +3418 m +4 h +1 h +4 h +3419 m +119 h +3420 m +3 h +4 h +1 h +10 h +4 h +3421 m +10 h +4 h +272 h +3422 m +347 m +4 h +1 h +3423 m +4 h +4 h +1 h +359 h +1836 m +1723 m +10 h +25 h +332 h +4 h +92 h +4 h +397 m +4 h +129 h +4 h +195 h +10 h +1766 h +4 h +36 h +4 h +3424 m +4 h +10 h +698 m +3425 m +1 h +57 h +10 h +3426 m +1 h +3427 m +10 h +11 h +3428 m +146 h +79 h +1 h +1 h +74 h +109 h +55 h +10 h +64 h +10 h +3429 m +10 h +3430 m +3278 m +10 h +10 h +4 h +4 h +4 h +3431 m +10 h +82 h +1822 h +857 h +3432 m +4 h +238 h +11 h +1 h +4 h +279 h +1 h +1 h +3433 m +10 h +3434 m +1 h +10 h +322 m +4 h +4 h +1 h +64 h +167 h +10 h +1089 h +3435 m +276 h +10 h +56 h +196 h +10 h +10 h +1 h +4 h +4 h +3436 m +1 h +3437 m +3438 m +1 h +3439 m +125 h +1 h +1740 m +3440 m +1 h +28 h +4 h +3441 m +1 h +4 h +204 h +1571 m +3 h +1 h +1 h +583 m +1 h +4 h +1 h +4 h +316 m +4 h +4 h +4 h +4 h 
+1418 m +10 h +74 h +1 h +1 h +1 h +3442 m +10 h +10 h +3443 m +1 h +3444 m +4 h +11 h +109 h +10 h +36 h +3445 m +4 h +258 h +10 h +10 h +196 h +4 h +3446 m +258 h +164 h +3447 m +3448 m +3449 m +779 m +79 h +195 h +1074 m +3450 m +3451 m +10 h +10 h +4 h +1 h +3452 m +4 h +59 h +10 h +92 h +125 h +79 h +3453 m +11 h +10 h +1 h +3454 m +4 h +4 h +10 h +64 h +1 h +10 h +4 h +4 h +109 h +31 h +4 h +603 m +4 h +477 m +45 h +4 h +1 h +3455 m +1 h +123 h +1 h +4 h +368 h +4 h +3456 m +4 h +4 h +1127 m +4 h +4 h +10 h +109 h +1 h +1 h +1 h +64 h +704 h +4 h +4 h +1830 m +57 h +4 h +40 h +41 h +65 h +4 h +3457 m +41 h +1 h +3458 m +4 h +10 h +3459 m +4 h +1 h +3460 m +3461 m +3462 m +1403 h +1 h +1 h +4 h +4 h +10 h +3463 m +124 h +10 h +146 h +1 h +3464 m +56 h +4 h +1 h +4 h +10 h +4 h +3465 m +368 h +83 h +3466 m +124 h +11 h +11 h +4 h +4 h +10 h +4 h +4 h +1 h +65 h +1 h +74 h +3467 m +31 h +3468 m +4 h +1595 m +4 h +4 h +1 h +1 h +3469 m +109 h +730 m +57 h +1 h +82 h +10 h +258 h +3470 m +174 h +3471 m +10 h +4 h +3472 m +4 h +1 h +196 h +31 h +1 h +3473 m +4 h +10 h +11 h +1 h +4 h +3474 m +31 h +1 h +4 h +4 h +10 h +4 h +464 h +3475 m +1 h +2124 h +10 h +135 h +1 h +110 h +1 h +1 h +1 h +3476 m +4 h +10 h +1 h +1 h +3477 m +4 h +10 h +4 h +1 h +3478 m +1 h +2887 m +1016 h +4 h +119 h +1 h +3479 m +1 h +4 h +3480 m +10 h +13 h +4 h +3481 m +10 h +186 h +3482 m +3483 m +1 h +2614 m +3484 m +4 h +1 h +61 m +1 h +1 h +1 h +3485 m +59 h +3486 m +1 h +3487 m +10 h +169 h +1 h +10 h +3488 m +4 h +74 h +4 h +1070 m +4 h +1685 m +82 h +541 h +83 h +614 m +583 m +4 h +10 h +10 h +4 h +3489 m +10 h +4 h +114 h +3490 m +4 h +92 h +4 h +57 h +4 h +4 h +1 h +41 h +4 h +124 h +3491 m +4 h +181 h +3 h +4 h +1 h +4 h +36 h +3492 m +13 h +109 h +146 h +1 h +57 h +3493 m +238 h +4 h +10 h +3494 m +1 h +146 h +10 h +3495 m +82 h +4 h +1 h +3 h +4 h +11 h +4 h +230 h +3496 m +4 h +4 h +3497 m +10 h +4 h +3498 m +10 h +4 h +4 h +4 h +124 h +3499 m +4 h +3500 m +1 h +92 h +25 h +11 h 
+65 h +4 h +1710 m +1 h +4 h +1 h +4 h +1 h +10 h +3501 m +25 h +10 h +11 h +620 m +10 h +1886 m +4 h +4 h +10 h +1250 h +1 h +3502 m +1 h +4 h +109 h +3503 m +124 h +11 h +69 h +270 h +3504 m +3505 m +3506 m +536 h +10 h +10 h +4 h +3507 m +10 h +73 h +1 h +995 m +157 h +13 h +10 h +4 h +2002 m +2303 m +3508 m +10 h +4 h +1 h +65 h +10 h +3509 m +147 h +11 h +307 h +2436 m +10 h +57 h +10 h +3510 m +11 h +4 h +1 h +1 h +1 h +1 h +10 h +10 h +146 h +3511 m +3512 m +2028 m +114 h +4 h +570 h +25 h +4 h +10 h +1 h +801 m +147 h +4 h +4 h +4 h +92 h +10 h +10 h +195 h +4 h +10 h +1 h +11 h +3 h +1 h +1 h +3513 m +1914 m +124 h +79 h +4 h +10 h +56 h +4 h +11 h +167 h +3514 m +11 h +3396 m +1 h +3515 m +79 h +278 h +3516 m +434 h +4 h +241 h +4 h +13 h +447 h +57 h +108 h +173 h +4 h +4 h +57 h +94 h +3517 m +1 h +4 h +1 h +10 h +3518 m +3519 m +4 h +83 h +3520 m +1 h +258 h +79 h +264 m +3521 m +192 h +4 h +10 h +4 h +4 h +10 h +11 h +110 h +3522 m +1 h +124 h +27 h +3523 m +4 h +3524 m +41 h +4 h +3525 m +4 h +10 h +1470 h +4 h +1 h +4 h +82 h +83 h +4 h +4 h +4 h +1 h +79 h +11 h +4 h +4 h +1 h +10 h +104 h +10 h +4 h +3526 m +10 h +4 h +1260 m +1 h +1 h +3527 m +1 h +83 h +1 h +4 h +3528 m +3529 m +10 h +144 h +3530 m +2843 m +1 h +1 h +10 h +184 h +1 h +358 h +109 h +10 h +4 h +3531 m +4 h +41 h +1 h +83 h +185 h +3532 m +4 h +1 h +129 h +3533 m +3534 m +1 h +238 h +4 h +1 h +10 h +1 h +1 h +3535 m +124 h +10 h +10 h +1 h +4 h +10 h +4 h +3536 m +1 h +4 h +1 h +601 h +10 h +10 h +10 h +10 h +4 h +1 h +3537 m +12 h +2379 m +82 h +3538 m +4 h +4 h +1 h +48 h +4 h +1 h +3539 m +1 h +10 h +83 h +1 h +358 h +4 h +10 h +11 h +3540 m +3541 m +10 h +4 h +82 h +3542 m +65 h +25 h +4 h +4 h +4 h +1 h +4 h +4 h +3543 m +135 h +4 h +3544 m +3545 m +4 h +11 h +368 h +266 h +1 h +3546 m +119 h +4 h +1470 h +83 h +3547 m +1081 m +82 h +3115 m +4 h +1250 h +10 h +3548 m +4 h +3549 m +1 h +59 h +581 m +10 h +25 h +4 h +25 h +3550 m +186 h +332 h +403 h +4 h +109 h +10 h +109 h +92 
h +1 h +1 h +3551 m +4 h +10 h +10 h +4 h +84 m +10 h +204 h +97 h +10 h +10 h +3 h +4 h +4 h +3552 m +1 h +4 h +82 h +11 h +3553 m +4 h +4 h +276 h +3554 m +3555 m +1 h +124 h +173 h +10 h +10 h +59 h +1985 m +10 h +3556 m +4 h +1 h +3557 m +12 h +4 h +4 h +10 h +8 h +4 h +59 h +10 h +276 h +3558 m +1 h +3559 m +1 h +4 h +10 h +10 h +1 h +4 h +4 h +3560 m +1 h +4 h +3561 m +1218 m +4 h +27 h +114 h +112 h +1 h +79 h +2846 m +1 h +10 h +4 h +4 h +97 h +4 h +125 h +12 h +82 h +3278 h +332 h +4 h +10 h +10 h +1 h +11 h +3562 m +1 h +3563 m +1003 h +3564 m +91 h +1 h +1 h +3565 m +3 h +1 h +1 h +4 h +3566 m +4 h +10 h +3567 m +1027 h +987 m +10 h +3568 m +4 h +4 h +3569 m +1 h +1 h +1 h +1 h +4 h +103 h +41 h +10 h +3570 m +4 h +1564 m +4 h +3571 m +10 h +108 h +3572 m +3573 m +1020 m +4 h +3574 m +1 h +604 m +4 h +3575 m +125 h +170 h +4 h +3576 m +1128 m +1 h +1 h +3577 m +1 h +124 h +11 h +447 h +4 h +45 h +195 h +4 h +3578 m +3579 m +1 h +4 h +4 h +57 h +4 h +1 h +3580 m +3581 m +10 h +1 h +3582 m +4 h +10 h +278 h +4 h +3583 m +195 h +10 h +11 h +1 h +1 h +3584 m +4 h +156 h +11 h +4 h +59 h +1 h +1 h +1016 h +10 h +10 h +3585 m +109 h +4 h +1884 m +4 h +4 h +112 h +3586 m +1 h +4 h +10 h +1 h +3587 m +185 h +1 h +55 h +1 h +4 h +3588 m +4 h +4 h +4 h +3589 m +4 h +1 h +10 h +238 h +2367 m +65 h +3590 m +25 h +64 h +10 h +1 h +3591 m +10 h +3592 m +169 h +1 h +1 h +3593 m +4 h +10 h +3594 m +157 h +1 h +157 h +10 h +41 h +10 h +1 h +10 h +11 h +1 h +124 h +41 h +74 h +4 h +4 h +3595 m +299 h +195 h +3596 m +10 h +567 m +10 h +1 h +1 h +40 h +45 h +3597 m +10 h +129 h +4 h +1 h +272 h +3 h +4 h +4 h +4 h +10 h +131 m +74 h +112 h +1 h +10 h +119 h +99 m +3598 m +3 h +41 h +4 h +10 h +5 m +74 h +4 h +4 h +92 h +4 h +692 h +119 h +10 h +1 h +4 h +939 h +1409 h +3599 m +109 h +717 m +1 h +4 h +1 h +181 h +1 h +1646 m +170 h +1 h +4 h +4 h +1 h +687 h +74 h +3600 m +156 h +1 h +3601 m +1 h +3602 m +73 h +4 h +14 m +33 h +3603 m +3604 m +57 h +1 h +4 h +27 h +4 h +109 
h +1 h +10 h +10 h +1 h +578 m +10 h +146 h +4 h +12 h +3605 m +104 h +41 h +1 h +276 h +82 h +57 h +1 h +332 h +31 h +265 h +1 h +3606 m +3607 m +4 h +83 h +10 h +3608 m +41 h +10 h +2819 m +3609 m +3610 m +3 h +124 h +3611 m +642 m +11 h +3 h +1 h +4 h +3612 m +3613 m +3614 m +74 h +3615 m +10 h +4 h +4 h +83 h +123 h +3616 m +3617 m +4 h +185 h +3618 m +3619 m +11 h +83 h +3620 m +3621 m +4 h +143 h +4 h +4 h +10 h +190 h +10 h +1 h +4 h +4 h +3622 m +4 h +172 h +3623 m +55 h +3624 m +92 h +4 h +2124 h +22 h +1 h +4 h +358 h +4 h +1220 m +11 h +4 h +1 h +1 h +3625 m +1359 h +4 h +10 h +170 h +4 h +1089 h +10 h +11 h +25 h +1403 h +1 h +164 h +82 h +10 h +3381 m +1 h +10 h +4 h +36 h +377 h +3626 m +1 h +4 h +2362 m +3627 m +3 h +3628 m +3629 m +4 h +1 h +1 h +74 h +4 h +4 h +11 h +4 h +83 h +4 h +3630 m +3631 m +10 h +4 h +4 h +1 h +109 h +4 h +4 h +1 h +976 h +3632 m +4 h +1 h +4 h +69 h +10 h +1791 m +3633 m +10 h +338 m +10 h +1936 m +57 h +3634 m +10 h +1 h +489 m +4 h +4 h +1 h +112 h +1 h +1 h +31 h +4 h +3635 m +4 h +3636 m +4 h +4 h +4 h +83 h +4 h +114 h +4 h +3637 m +51 m +3638 m +3639 m +146 h +10 h +3640 m +1 h +1 h +3641 m +4 h +3642 m +10 h +4 h +10 h +10 h +10 h +92 h +1 h +1 h +59 h +258 h +794 h +2628 m +1 h +1 h +918 m +55 h +1 h +4 h +3643 m +3644 m +3645 m +4 h +4 h +195 h +4 h +3646 m +3647 m +82 h +4 h +3648 m +10 h +1 h +195 h +143 h +28 h +56 h +12 h +520 h +1 h +83 h +10 h +1 h +13 h +3 h +626 m +10 h +3649 m +1137 h +124 h +25 h +167 h +10 h +1 h +1 h +1 h +4 h +2379 m +10 h +4 h +10 h +4 h +83 h +114 h +1 h +3650 m +4 h +805 m +11 h +129 h +3651 m +97 h +3 h +4 h +124 h +3652 m +4 h +1 h +139 h +10 h +195 h +3653 m +307 h +4 h +48 h +3654 m +10 h +57 h +3655 m +1 h +1 h +1 h +109 h +41 h +4 h +4 h +4 h +1406 h +3656 m +3657 m +695 m +1 h +1835 h +11 h +3658 m +31 h +10 h +172 h +3 h +3659 m +146 h +124 h +4 h +3660 m +1 h +4 h +3661 m +4 h +4 h +1 h +4 h +10 h +801 m +31 h +10 h +4 h +45 h +4 h +1 h +1 h +195 h +3662 m +1 h +1 h +3663 
m +4 h +31 h +620 m +3664 m +48 h +1740 m +156 h +185 h +65 h +4 h +1796 h +3665 m +113 h +10 h +3666 m +10 h +4 h +4 h +1 h +10 h +4 h +3667 m +258 h +4 h +31 h +3668 m +25 h +158 h +2846 m +3669 m +158 h +1 h +129 h +1 h +158 h +10 h +1 h +3670 m +3671 m +3672 m +147 h +1250 h +25 h +1 h +25 h +74 h +3673 m +4 h +157 h +2442 m +169 h +3674 m +158 h +10 h +11 h +74 h +3675 m +1 h +31 h +1 h +41 h +1 h +3676 m +3677 m +1650 m +195 h +4 h +3678 m +57 h +10 h +3679 m +57 h +3680 m +195 h +3681 m +83 h +3682 m +59 h +10 h +3683 m +1 h +4 h +3684 m +1 h +3685 m +386 h +3686 m +3687 m +3688 m +4 h +10 h +1 h +65 h +157 h +3689 m +190 h +4 h +3690 m +766 m +3691 m +4 h +1 h +57 h +4 h +11 h +1 h +3692 m +146 h +4 h +1 h +4 h +3693 m +1 h +3694 m +4 h +4 h +4 h +1 h +1 h +4 h +4 h +3695 m +3696 m +3697 m +3698 m +3699 m +3700 m +10 h +4 h +3701 m +3702 m +4 h +3 h +147 h +3703 m +1 h +4 h +10 h +1 h +1 h +4 h +10 h +3 h +986 h +4 h +4 h +10 h +56 h +1030 h +3704 m +4 h +1 h +10 h +3705 m +4 h +1 h +3706 m +82 h +4 h +45 h +4 h +158 h +3707 m +3708 m +885 h +114 h +3709 m +4 h +1 h +3710 m +4 h +94 h +4 h +704 h +184 h +1105 m +125 h +3711 m +4 h +31 h +3712 m +1 h +4 h +10 h +1714 m +2688 h +358 h +1 h +3713 m +3714 m +25 h +109 h +11 h +1470 h +1 h +368 h +1 h +4 h +4 h +3715 m +10 h +119 h +289 h +4 h +4 h +3716 m +4 h +4 h +4 h +4 h +3717 m +0 m +4 h +10 h +1 h +4 h +1 h +57 h +170 h +3 h +10 h +601 h +1 h +1 h +569 h +22 h +4 h +113 h +1 h +10 h +3718 m +1 h +113 h +3719 m +1 h +4 h +4 h +1 h +10 h +83 h +109 h +4 h +57 h +1 h +109 h +601 h +79 h +1 h +169 h +4 h +4 h +1 h +4 h +1 h +1 h +1875 m +1 h +3720 m +4 h +2730 m +10 h +11 h +1 h +10 h +11 h +10 h +55 h +57 h +4 h +332 h +4 h +10 h +155 m +4 h +583 h +367 h +10 h +4 h +65 h +3721 m +10 h +4 h +3722 m +1372 m +4 h +4 h +630 m +4 h +57 h +1 h +4 h +10 h +4 h +3723 m +4 h +520 h +4 h +488 h +4 h +1 h +3724 m +3725 m +190 h +3726 m +10 h +536 h +1 h +10 h +3727 m +135 h +4 h +41 h +3728 m +3729 m +10 h +181 h +4 h 
+82 h +258 h +10 h +3730 m +10 h +976 h +55 h +1 h +4 h +4 h +1 h +1 h +1 h +265 h +143 h +4 h +82 h +3731 m +3732 m +4 h +4 h +1 h +82 h +170 h +801 h +2278 m +10 h +463 m +3733 m +10 h +4 h +109 h +3734 m +1 h +4 h +869 m +938 m +4 h +4 h +1 h +3735 m +1 h +65 h +4 h +123 h +1 h +11 h +4 h +3736 m +4 h +1137 h +1 h +97 h +4 h +1 h +3737 m +27 h +1 h +1 h +4 h +1 h +1 h +2625 m +45 h +3738 m +3739 m +57 h +147 h +3740 m +147 h +386 h +1100 m +3741 m +3 h +3742 m +3743 m +1 h +3744 m +1 h +104 h +138 h +10 h +3745 m +1 h +146 h +10 h +57 h +10 h +1 h +3746 m +4 h +3747 m +1083 m +59 h +10 h +1 h +4 h +1 h +1 h +10 h +3748 m +4 h +258 h +1 h +3749 m +1 h +146 h +3750 m +3751 m +3752 m +11 h +4 h +3753 m +3754 m +12 h +11 h +83 h +4 h +41 h +4 h +307 h +4 h +106 h +4 h +1403 h +4 h +10 h +4 h +1 h +10 h +4 h +447 h +4 h +55 h +4 h +10 h +4 h +4 h +11 h +109 h +135 h +4 h +1 h +3755 m +359 h +1 h +4 h +1 h +3756 m +3757 m +4 h +1 h +10 h +4 h +124 h +12 h +112 h +3758 m +4 h +1 h +857 h +3759 m +10 h +3760 m +97 h +3761 m +1 h +104 h +3762 m +41 h +4 h +1 h +10 h +1 h +1 h +4 h +10 h +1 h +1 h +10 h +3763 m +4 h +1 h +2139 m +55 h +4 h +3764 m +59 h +10 h +1 h +4 h +203 m +3765 m +146 h +64 h +4 h +4 h +1 h +3766 m +1 h +4 h +10 h +3767 m +5 m +10 h +3768 m +4 h +4 h +447 h +3769 m +10 h +4 h +92 h +3770 m +25 h +11 h +4 h +3771 m +10 h +190 h +82 h +4 h +1 h +41 h +186 h +1 h +1 h +1 h +59 h +3772 m +10 h +1 h +3773 m +3774 m +1 h +10 h +10 h +1 h +1 h +82 h +10 h +4 h +1 h +135 h +258 h +195 h +4 h +3775 m +4 h +3776 m +10 h +4 h +25 h +4 h +109 h +1 h +3777 m +1 h +3778 m +10 h +12 h +92 h +1 h +4 h +123 h +4 h +3779 m +4 h +4 h +97 h +1 h +1627 m +1 h +3780 m +3781 m +3782 m +4 h +56 h +3783 m +3784 m +82 h +3785 m +4 h +3 h +59 h +1 h +2163 m +250 h +4 h +3786 m +10 h +3787 m +4 h +3788 m +196 h +3789 m +10 h +258 h +3790 m +1714 m +10 h +1 h +4 h +36 h +4 h +2522 m +65 h +4 h +4 h +1 h +3791 m +4 h +25 h +4 h +65 h +10 h +4 h +4 h +77 h +4 h +10 h +4 h +1 h +1 h 
+190 h +1 h +4 h +3792 m +4 h +1 h +64 h +55 h +1 h +10 h +1981 m +4 h +3793 m +31 h +82 h +1003 h +278 h +278 h +125 h +4 h +464 h +3794 m +3795 m +10 h +1 h +1 h +27 h +3796 m +4 h +736 m +3797 m +1 h +12 h +897 m +3798 m +443 h +114 h +4 h +4 h +74 h +3799 m +3800 m +56 h +11 h +4 h +4 h +91 h +4 h +4 h +1 h +3801 m +4 h +4 h +74 h +125 h +3802 m +3803 m +12 h +10 h +10 h +10 h +4 h +1 h +3804 m +1024 m +1 h +10 h +1 h +10 h +3805 m +10 h +536 h +3806 m +3807 m +13 h +135 h +990 m +1 h +1 h +4 h +124 h +1 h +1 h +10 h +57 h +65 h +1 h +4 h +3808 m +1 h +8 h +3809 m +8 h +1 h +73 h +10 h +3810 m +4 h +3811 m +1 h +1 h +3812 m +10 h +3813 m +3814 m +147 h +10 h +3815 m +113 h +1 h +4 h +146 h +10 h +97 h +274 h +10 h +4 h +4 h +124 h +3816 m +11 h +3817 m +3818 m +10 h +25 h +1 h +1 h +1 h +3819 m +4 h +135 h +4 h +10 h +1 h +258 h +1470 h +4 h +1 h +1 h +1 h +3820 m +3821 m +4 h +93 h +1 h +4 h +10 h +11 h +167 h +1 h +1 h +4 h +3822 m +27 h +11 h +3823 m +3824 m +4 h +4 h +1 h +3825 m +4 h +399 h +10 h +83 h +146 h +3826 m +195 h +1 h +4 h +1 h +1 h +3827 m +10 h +10 h +1677 m +587 m +1 h +224 h +4 h +1 h +3828 m +3829 m +4 h +4 h +1 h +1 h +55 h +59 h +1 h +10 h +4 h +264 m +10 h +10 h +4 h +3830 m +3831 m +1 h +238 h +4 h +911 h +1 h +1 h +3832 m +4 h +1 h +11 h +55 h +11 h +57 h +4 h +3833 m +3834 m +2379 h +4 h +3835 m +4 h +467 m +3836 m +124 h +1 h +65 h +1 h +83 h +11 h +3837 m +4 h +250 h +31 h +1016 h +4 h +10 h +3838 m +94 h +313 h +4 h +1 h +1 h +1 h +10 h +4 h +4 h +173 h +4 h +1 h +3839 m +897 m +4 h +1 h +3840 m +1 h +4 h +4 h +718 h +3841 m +1 h +3842 m +83 h +195 h +3843 m +1 h +1 h +1 h +3844 m +3845 m +10 h +4 h +4 h +4 h +250 h +4 h +3846 m +4 h +359 h +1 h +1 h +109 h +1 h +4 h +8 h +2119 m +4 h +4 h +1 h +4 h +3847 m +4 h +1 h +4 h +82 h +4 h +3848 m +1 h +1 h +55 h +3849 m +3850 m +3851 m +82 h +1 h +10 h +1 h +3852 m +1 h +3853 m +25 h +10 h +4 h +3854 m +190 h +3855 m +3856 m +1 h +10 h +3857 m +265 h +1 h +3 h +10 h +31 h +1 h +2474 m 
+3858 m +3859 m +10 h +10 h +938 m +3860 m +3861 m +3862 m +104 h +3 h +2300 m +10 h +1 h +1 h +3863 m +4 h +169 h +4 h +181 h +808 h +3864 m +2101 m +3025 m +92 h +4 h +181 h +3865 m +1 h +69 h +4 h +23 m +125 h +57 h +3866 m +27 h +1 h +1 h +4 h +3867 m +1 h +3868 m +4 h +229 h +4 h +538 h +31 h +3869 m +10 h +4 h +3870 m +64 h +4 h +4 h +1 h +23 m +3558 m +10 h +4 h +4 h +1 h +10 h +297 h +4 h +1 h +109 h +4 h +4 h +33 h +1 h +4 h +4 h +1470 h +10 h +143 h +185 h +1 h +1 h +256 h +4 h +204 h +3871 m +3872 m +1 h +3873 m +125 h +885 h +11 h +3874 m +322 m +65 h +4 h +3875 m +258 h +3876 m +1835 h +10 h +4 h +4 h +238 h +1 h +4 h +11 h +4 h +4 h +4 h +1 h +4 h +190 h +4 h +41 h +1 h +769 m +4 h +3877 m +59 h +1 h +55 h +1 h +556 h +3878 m +77 h +104 h +3879 m +10 h +1 h +4 h +3880 m +3881 m +1 h +119 h +1 h +3882 m +1 h +1074 m +4 h +10 h +2300 m +3883 m +3884 m +278 h +3885 m +83 h +4 h +59 h +10 h +123 h +119 h +3886 m +1 h +4 h +313 h +4 h +10 h +3887 m +238 h +1 h +31 h +125 h +3888 m +10 h +10 h +4 h +717 m +13 h +4 h +57 h +3067 m +129 h +319 h +423 m +3889 m +3890 m +1 h +447 h +4 h +1 h +630 m +4 h +4 h +4 h +938 h +79 h +3891 m +1 h +1 h +97 h +10 h +59 h +10 h +4 h +3892 m +157 h +83 h +3893 m +3 h +11 h +3894 m +214 m +1 h +57 h +3895 m +4 h +41 h +83 h +11 h +3896 m +11 h +3897 m +258 h +59 h +10 h +3898 m +146 h +4 h +3 h +4 h +3899 m +4 h +4 h +4 h +3900 m +1016 h +40 h +520 h +1 h +3901 m +3902 m +1 h +687 h +3903 m +3904 m +10 h +2954 m +1 h +4 h +73 h +147 h +3905 m +3906 m +10 h +10 h +3907 m +10 h +1 h +73 h +124 h +1 h +10 h +3908 m +3089 m +73 h +11 h +4 h +1772 h +61 m +4 h +1 h +278 h +1 h +139 h +1 h +3909 m +10 h +1646 m +3910 m +3911 m +147 h +10 h +4 h +11 h +4 h +11 h +57 h +65 h +4 h +3912 m +119 h +4 h +3913 m +3914 m +4 h +1 h +1 h +31 h +3915 m +1359 h +195 h +10 h +4 h +10 h +10 h +3916 m +1089 h +10 h +278 h +3917 m +3918 m +11 h +3919 m +10 h +1 h +224 h +3920 m +10 h +3921 m +114 h +4 h +3922 m +3923 m +4 h +10 h +1 h +3924 m 
+3925 m +806 m +10 h +94 h +1 h +3926 m +10 h +736 m +11 h +581 m +4 h +10 h +104 h +3927 m +3 h +3928 m +4 h +3929 m +4 h +3930 m +124 h +4 h +10 h +36 h +1 h +125 h +1 h +4 h +13 h +114 h +1 h +82 h +3931 m +1 h +4 h +109 h +4 h +3932 m +3933 m +1 h +3934 m +1 h +11 h +477 m +3935 m +3 h +1 h +170 h +11 h +3936 m +1137 h +10 h +3937 m +36 h +31 h +82 h +3 h +10 h +1 h +1 h +1 h +41 h +10 h +3938 m +3939 m +11 h +3940 m +195 h +4 h +4 h +11 h +4 h +56 h +3941 m +11 h +1 h +4 h +4 h +10 h +1 h +3942 m +1344 m +10 h +4 h +97 h +13 h +4 h +10 h +4 h +1 h +83 h +74 h +236 m +10 h +1 h +1 h +3943 m +3944 m +10 h +4 h +1 h +10 h +1 h +74 h +82 h +4 h +3945 m +1114 m +3946 m +3947 m +10 h +3948 m +3949 m +4 h +3950 m +3951 m +386 h +36 h +3952 m +1 h +1 h +3953 m +10 h +10 h +1 h +1 h +3 h +10 h +1 h +4 h +4 h +1 h +74 h +4 h +83 h +10 h +3954 m +36 h +1 h +10 h +10 h +3955 m +1 h +10 h +704 h +3956 m +3957 m +1 h +27 h +195 h +124 h +1 h +10 h +1 h +3958 m +4 h +4 h +3959 m +1 h +3960 m +10 h +1 h +4 h +109 h +4 h +3961 m +10 h +2887 m +36 h +3962 m +4 h +1 h +57 h +4 h +83 h +10 h +3622 m +1 h +1650 m +195 h +1 h +4 h +57 h +25 h +3 h +3963 m +196 h +4 h +10 h +1 h +4 h +4 h +4 h +4 h +265 h +4 h +11 h +74 h +10 h +41 h +3964 m +3965 m +4 h +4 h +4 h +4 h +4 h +11 h +10 h +3966 m +11 h +10 h +241 h +1 h +3967 m +4 h +601 h +3968 m +10 h +10 h +3969 m +65 h +56 h +2205 m +1780 h +4 h +164 h +3970 m +4 h +3971 m +64 h +4 h +3972 m +104 h +289 h +3973 m +3974 m +146 h +1 h +10 h +1861 m +4 h +262 h +4 h +10 h +4 h +173 h +3975 m +3976 m +109 h +10 h +258 h +3977 m +4 h +4 h +10 h +4 h +1 h +1 h +4 h +125 h +146 h +124 h +57 h +10 h +97 h +3978 m +4 h +82 h +4 h +1 h +10 h +3979 m +1 h +123 h +1 h +3980 m +4 h +1 h +1 h +82 h +3981 m +3982 m +10 h +3983 m +8 h +4 h +10 h +1 h +10 h +4 h +3984 m +82 h +1 h +10 h +3985 m +4 h +1 h +1 h +3986 m +10 h +1 h +1822 h +4 h +4 h +4 h +4 h +3987 m +536 h +1 h +10 h +10 h +124 h +1642 m +23 h +3988 m +1 h +3989 m +48 h +3990 m +3991 
m +135 h +57 h +3992 m +3993 m +1 h +1 h +11 h +3994 m +3 h +83 h +1218 m +3995 m +264 m +1 h +11 h +1822 h +1 h +10 h +1 h +3996 m +10 h +3997 m +10 h +40 h +3998 m +1 h +1 h +3999 m +12 h +3177 m +4 h +1 h +118 m +276 h +104 h +4 h +11 h +83 h +139 h +4000 m +10 h +4001 m +1137 h +4 h +173 h +4 h +4 h +27 h +976 h +4002 m +109 h +10 h +10 h +278 h +800 m +64 h +4 h +10 h +4 h +4003 m +1 h +59 h +1 h +4 h +4004 m +195 h +4 h +1 h +10 h +4 h +1685 m +4005 m +4 h +4 h +1 h +4006 m +1 h +5 m +4 h +4007 m +4 h +4008 m +59 h +10 h +158 h +109 h +1 h +4 h +10 h +763 m +4 h +4 h +1444 m +4 h +110 h +4 h +4 h +4 h +4 h +3 h +10 h +4 h +10 h +135 h +10 h +27 h +1 h +4 h +190 h +3 h +4009 m +1 h +1722 m +4010 m +147 h +4011 m +1 h +1 h +4012 m +4 h +10 h +10 h +4013 m +4014 m +12 h +4 h +23 h +4 h +83 h +4015 m +1 h +520 h +83 h +114 h +1 h +4016 m +59 h +692 h +1 h +83 h +114 h +1 h +4 h +79 h +12 h +114 h +1 h +4017 m +4 h +146 h +41 h +4 h +10 h +4018 m +1 h +4 h +10 h +4 h +94 h +10 h +124 h +747 m +4019 m +4020 m +1 h +10 h +4021 m +164 h +4 h +110 h +146 h +4022 m +4023 m +4 h +1 h +4024 m +82 h +1 h +620 m +1 h +4025 m +1 h +4026 m +4027 m +2002 m +10 h +4 h +4028 m +578 m +4 h +11 h +17 m +125 h +4029 m +4 h +59 h +4 h +10 h +1 h +169 h +4 h +4030 m +4 h +11 h +4 h +124 h +4 h +1 h +1 h +1 h +4 h +4031 m +4032 m +4 h +4033 m +83 h +1642 m +238 h +25 h +4 h +1 h +4034 m +4035 m +10 h +4036 m +10 h +4 h +1635 m +91 h +4037 m +355 m +4038 m +109 h +1 h +113 h +1 h +872 m +4 h +358 h +1 h +169 h +1 h +4 h +1 h +104 h +1 h +4 h +11 h +2347 m +4 h +4039 m +114 h +1 h +125 h +10 h +4040 m +10 h +4 h +190 h +4041 m +4042 m +4 h +4043 m +4 h +10 h +353 m +1 h +4044 m +10 h +1 h +4 h +1 h +4045 m +4046 m +83 h +1 h +4047 m +10 h +4 h +4 h +256 h +4048 m +1 h +1 h +11 h +10 h +65 h +10 h +297 h +10 h +10 h +10 h +97 h +4049 m +59 h +11 h +4050 m +4 h +10 h +4 h +74 h +1 h +4051 m +1 h +10 h +1 h +4 h +4 h +4052 m +4 h +1 h +4 h +4053 m +4054 m +1 h +4 h +1 h +976 h +119 h +4 h 
+11 h +4055 m +82 h +10 h +124 h +10 h +190 h +4056 m +1 h +4057 m +4058 m +1 h +10 h +59 h +3 h +140 m +65 h +221 m +1 h +4 h +10 h +10 h +4 h +59 h +10 h +4 h +4 h +4 h +258 h +10 h +4059 m +104 h +4060 m +4 h +4 h +4061 m +1 h +1 h +4062 m +73 h +82 h +4 h +4063 m +13 h +4 h +4 h +4064 m +4 h +82 h +4 h +1 h +4065 m +4066 m +10 h +779 h +4067 m +4 h +82 h +4 h +74 h +10 h +4 h +1 h +4068 m +4069 m +1 h +1 h +125 h +11 h +399 h +114 h +4 h +4070 m +4071 m +10 h +1655 m +4 h +11 h +4 h +278 h +1 h +1 h +27 h +4 h +65 h +4072 m +10 h +4 h +10 h +185 h +4 h +4073 m +41 h +4 h +1 h +1 h +4074 m +358 h +1 h +4 h +4075 m +10 h +1 h +170 h +4 h +4076 m +25 h +3 h +238 h +5 m +3 h +332 h +1 h +640 h +4 h +986 h +1 h +1 h +10 h +83 h +4 h +25 h +270 h +82 h +10 h +11 h +22 h +4 h +4 h +4 h +4 h +41 h +59 h +64 h +4 h +10 h +10 h +55 h +3342 m +1 h +3 h +4077 m +4078 m +82 h +4079 m +4 h +97 h +10 h +4 h +264 m +10 h +3 h +4 h +4 h +10 h +1 h +59 h +4 h +381 m +4 h +10 h +4 h +1 h +10 h +1454 m +4080 m +1 h +4081 m +92 h +1 h +118 m +57 h +4082 m +399 h +4083 m +1 h +10 h +4084 m +1 h +4 h +4 h +11 h +4 h +4 h +10 h +3048 m +4085 m +4 h +1 h +939 h +4086 m +4 h +1 h +2418 m +124 h +31 h +110 h +266 h +82 h +10 h +74 h +10 h +57 h +4087 m +4 h +4088 m +1 h +4 h +4 h +911 h +4089 m +4 h +4 h +10 h +172 h +1 h +4090 m +4 h +4 h +4 h +83 h +41 h +11 h +4091 m +4 h +4 h +4092 m +10 h +10 h +520 h +1 h +4093 m +146 h +112 h +4094 m +4095 m +10 h +10 h +4096 m +109 h +4097 m +1 h +10 h +1751 m +4 h +10 h +359 h +156 h +4 h +4098 m +4099 m +41 h +4100 m +1 h +57 h +4101 m +4 h +4102 m +1 h +1 h +36 h +10 h +1 h +1 h +10 h +1 h +125 h +55 h +4103 m +1 h +4104 m +4 h +358 h +12 h +10 h +4 h +139 h +4105 m +4106 m +4107 m +3 h +4108 m +1 h +4 h +82 h +10 h +4 h +4109 m +4110 m +4111 m +4 h +3 h +547 m +4112 m +4 h +11 h +278 h +4113 m +4 h +45 h +10 h +10 h +4 h +1 h +4 h +57 h +2606 m +4 h +4 h +10 h +185 h +4114 m +4115 m +4116 m +4117 m +27 h +4118 m +4119 m +3025 m +56 h +10 h 
+82 h +4 h +114 h +1 h +10 h +4120 m +59 h +10 h +1 h +11 h +104 h +10 h +10 h +124 h +146 h +167 h +12 h +4 h +195 h +4 h +10 h +4121 m +10 h +65 h +4 h +4122 m +4123 m +10 h +125 h +1 h +4124 m +79 h +4 h +1 h +1 h +4 h +113 h +124 h +4 h +4 h +12 h +124 h +1 h +57 h +4 h +4125 m +1308 m +10 h +1 h +4126 m +10 h +1 h +1 h +4127 m +1 h +4 h +4128 m +860 m +270 h +4 h +41 h +1564 m +4 h +10 h +1 h +4129 m +4 h +10 h +55 h +1 h +4 h +297 h +4130 m +10 h +4 h +1 h +1790 m +4 h +10 h +10 h +4131 m +1 h +1 h +22 h +31 h +4 h +10 h +4132 m +1 h +11 h +4 h +4133 m +4 h +1 h +109 h +1374 m +368 h +11 h +1 h +4134 m +59 h +4 h +10 h +1 h +4 h +114 h +4 h +4 h +10 h +147 h +4 h +2379 h +4 h +4135 m +4 h +4136 m +10 h +1 h +1 h +1403 h +488 h +4137 m +4 h +4138 m +10 h +4139 m +1 h +4 h +4140 m +10 h +3 h +493 m +4141 m +10 h +1 h +172 h +1 h +4142 m +4 h +10 h +4143 m +4144 m +4145 m +4 h +2087 m +368 h +1 h +73 h +1 h +4146 m +125 h +10 h +10 h +4 h +4147 m +1 h +4 h +4 h +1 h +1261 h +1 h +4148 m +11 h +4 h +1 h +1 h +10 h +4149 m +10 h +1 h +4150 m +757 m +949 m +4151 m +104 h +109 h +1 h +10 h +82 h +569 h +4 h +57 h +74 h +10 h +123 h +4152 m +10 h +4 h +4 h +4 h +4 h +4153 m +1 h +11 h +4154 m +2733 m +4155 m +1 h +10 h +57 h +97 h +4 h +4156 m +1914 m +224 h +4157 m +4158 m +1 h +1 h +4 h +4 h +316 m +4159 m +123 h +31 h +1261 h +31 h +10 h +104 h +1 h +4160 m +94 h +430 m +25 h +1 h +31 h +1835 h +10 h +170 h +1 h +125 h +57 h +1914 m +297 h +4 h +4161 m +11 h +1 h +10 h +4 h +1 h +3177 m +4 h +2215 m +1 h +4162 m +10 h +31 h +1 h +10 h +4163 m +64 h +1 h +4164 m +4 h +4165 m +10 h +4166 m +4 h +4167 m +83 h +1 h +73 h +1 h +27 h +11 h +4 h +11 h +195 h +104 h +843 m +10 h +57 h +147 h +278 h +195 h +3 h +4168 m +857 h +4169 m +10 h +4 h +4170 m +4171 m +1260 m +31 h +1 h +1 h +1 h +258 h +1 h +361 m +4 h +12 h +10 h +1 h +4 h +1 h +104 h +4 h +307 h +1 h +195 h +4172 m +1607 m +4173 m +27 h +4 h +27 h +692 h +447 h +4 h +4174 m +41 h +338 m +4 h +4175 m +1 h +687 h 
+4 h +4 h +3112 m +1 h +10 h +3272 m +4176 m +65 h +10 h +4177 m +8 h +164 h +4178 m +10 h +4 h +4 h +536 h +64 h +4179 m +13 h +1 h +10 h +12 h +4 h +10 h +10 h +262 h +4 h +125 h +1122 m +1 h +1470 h +45 h +4180 m +125 h +4 h +4 h +157 h +4181 m +10 h +1 h +4182 m +10 h +1 h +4183 m +4 h +1 h +1953 m +4184 m +4185 m +4186 m +4 h +976 h +1 h +3 h +185 h +1 h +4187 m +4188 m +79 h +4 h +82 h +12 h +339 m +4189 m +1 h +4190 m +10 h +1 h +11 h +4191 m +4192 m +4 h +4 h +4193 m +11 h +11 h +4194 m +10 h +4 h +4 h +56 h +4 h +158 h +10 h +4 h +110 h +1 h +10 h +4 h +1 h +10 h +1 h +82 h +4195 m +143 h +4196 m +4197 m +11 h +10 h +4198 m +1 h +1 h +338 m +371 h +10 h +57 h +4199 m +69 h +4 h +4200 m +10 h +10 h +13 h +1 h +4 h +1 h +195 h +353 m +109 h +4201 m +10 h +4 h +4 h +4 h +4 h +10 h +1 h +1 h +4202 m +4 h +10 h +1 h +10 h +994 m +4 h +4203 m +386 h +4 h +1 h +4204 m +135 h +4205 m +10 h +4 h +4206 m +31 h +4 h +1261 h +2964 m +383 h +12 h +4 h +1321 h +4207 m +10 h +4 h +4208 m +36 h +4209 m +4 h +4210 m +129 h +33 h +1 h +1 h +1 h +10 h +1 h +4211 m +82 h +4212 m +1250 h +4 h +4213 m +10 h +135 h +4214 m +4 h +13 h +4215 m +10 h +1 h +110 h +1822 h +10 h +184 h +4 h +4216 m +4 h +10 h +31 h +276 h +4217 m +10 h +1296 m +4218 m +4219 m +4220 m +10 h +4 h +41 h +1 h +10 h +770 m +167 h +4 h +1 h +1 h +4 h +4 h +4221 m +79 h +10 h +4 h +1 h +4 h +4 h +10 h +4222 m +265 h +4 h +1 h +104 h +4 h +1835 h +4223 m +1142 m +1 h +4 h +1 h +1 h +10 h +4224 m +124 h +4225 m +1 h +10 h +4 h +11 h +10 h +4226 m +1642 h +4227 m +11 h +4228 m +4 h +10 h +57 h +74 h +10 h +1 h +4229 m +4230 m +4 h +4 h +135 h +4 h +82 h +57 h +1 h +4 h +1 h +10 h +4 h +12 h +4 h +4231 m +4232 m +4 h +10 h +4233 m +4234 m +73 h +4235 m +4236 m +4237 m +656 m +4238 m +4239 m +1 h +4 h +4240 m +10 h +3221 m +4241 m +1 h +338 h +4 h +1 h +1 h +10 h +11 h +65 h +4242 m +4 h +258 h +3257 m +1 h +79 h +1 h +1 h +4 h +4243 m +10 h +4 h +1 h +10 h +4244 m +10 h +10 h +57 h +4 h +190 h +1 h +109 h +83 h 
+1 h +4245 m +11 h +1 h +4 h +386 h +4 h +83 h +124 h +4246 m +57 h +4247 m +4 h +238 h +10 h +897 h +82 h +10 h +4 h +1 h +1822 h +10 h +4 h +4248 m +12 h +1 h +4249 m +3 h +4 h +169 h +4250 m +4 h +65 h +4251 m +10 h +10 h +10 h +10 h +4 h +4 h +4252 m +1 h +11 h +10 h +1 h +1 h +4253 m +4 h +10 h +1 h +11 h +4 h +83 h +1 h +1137 h +139 h +83 h +4 h +4 h +10 h +1796 h +83 h +4254 m +10 h +4255 m +4256 m +10 h +10 h +11 h +1 h +4257 m +10 h +4 h +10 h +10 h +10 h +10 h +4258 m +4 h +4259 m +4260 m +4 h +278 h +138 h +1 h +4 h +4261 m +4 h +10 h +1 h +65 h +4 h +4 h +4 h +64 h +1 h +79 h +10 h +1 h +4 h +1 h +1 h +4 h +4 h +4262 m +1 h +4263 m +4264 m +10 h +92 h +1 h +1470 h +4265 m +4266 m +4 h +1 h +22 h +10 h +4267 m +4 h +4268 m +4 h +4269 m +1 h +10 h +4270 m +368 h +65 h +4 h +4 h +238 h +4 h +1 h +57 h +56 h +1 h +4 h +4 h +4271 m +124 h +1 h +11 h +4 h +4272 m +82 h +31 h +13 h +1 h +4 h +1 h +74 h +164 h +10 h +181 h +4273 m +4 h +123 h +11 h +1 h +4274 m +4275 m +82 h +4276 m +12 h +10 h +11 h +45 h +4277 m +11 h +4 h +59 h +45 h +1 h +4 h +1 h +4 h +10 h +1838 m +59 h +10 h +4 h +124 h +11 h +10 h +4278 m +10 h +4 h +1 h +74 h +1 h +11 h +4279 m +10 h +1 h +4280 m +4281 m +10 h +4282 m +4283 m +4 h +4284 m +4 h +13 h +266 h +4285 m +1 h +57 h +4286 m +4 h +10 h +4287 m +1 h +4 h +110 h +1 h +1 h +10 h +4288 m +1016 h +4289 m +169 h +1 h +13 h +1 h +82 h +4 h +10 h +1 h +1 h +3 h +1 h +83 h +136 m +1137 h +258 h +1619 m +267 m +25 h +11 h +10 h +4 h +4290 m +1766 h +4291 m +1 h +4292 m +22 h +4293 m +4 h +4 h +2733 m +74 h +1 h +2054 m +10 h +1 h +1 h +4 h +1 h +4294 m +1 h +4295 m +129 h +3 h +10 h +10 h +110 h +1 h +1 h +124 h +1 h +36 h +4296 m +4297 m +4 h +10 h +4298 m +1677 m +11 h +10 h +10 h +4299 m +10 h +56 h +4300 m +4301 m +1 h +1780 h +4302 m +10 h +1 h +31 h +31 h +114 h +10 h +4 h +4 h +4 h +4303 m +3 h +3 h +1 h +4 h +10 h +4304 m +4305 m +10 h +55 h +4 h +181 h +1201 m +274 h +4306 m +10 h +10 h +10 h +1 h +4 h +3 h +1 h +10 h +4 h +4307 
m +1 h +110 h +4 h +55 h +79 h +278 h +157 h +4308 m +276 h +297 h +124 h +4 h +4 h +1 h +10 h +4309 m +10 h +82 h +4 h +1 h +65 h +59 h +25 h +184 h +129 h +196 h +1218 m +10 h +4310 m +4311 m +13 h +1 h +307 h +4312 m +4 h +4313 m +1 h +1 h +4314 m +2028 m +1 h +112 h +4315 m +3025 m +10 h +219 m +125 h +146 h +41 h +3 h +4316 m +1 h +146 h +1 h +11 h +4 h +4317 m +10 h +4318 m +1650 m +4319 m +10 h +4320 m +1 h +195 h +10 h +4 h +10 h +109 h +10 h +10 h +10 h +56 h +4321 m +10 h +1 h +4 h +195 h +1 h +11 h +10 h +4 h +4 h +65 h +10 h +170 h +4 h +4 h +4322 m +4323 m +4 h +4324 m +4 h +138 h +195 h +4325 m +1 h +135 h +4 h +59 h +79 h +10 h +195 h +4326 m +4 h +10 h +4 h +10 h +4327 m +1 h +4 h +139 h +4 h +4 h +3396 h +4 h +1 h +1 h +4 h +4328 m +64 h +10 h +295 h +10 h +278 h +358 h +15 m +4329 m +4330 m +1975 m +1 h +1607 m +1 h +82 h +11 h +4 h +4 h +1406 h +4331 m +10 h +4 h +4332 m +1 h +4333 m +4 h +1 h +4 h +4334 m +4 h +4335 m +45 h +4336 m +10 h +4337 m +45 h +538 h +4338 m +278 h +11 h +1 h +104 h +570 h +4339 m +1 h +4340 m +114 h +10 h +3 h +4341 m +1089 h +10 h +4 h +1 h +1 h +82 h +1642 h +195 h +4 h +1 h +1 h +59 h +129 h +297 h +4 h +4342 m +1 h +388 m +164 h +1 h +571 m +276 h +4343 m +4 h +4344 m +73 h +4 h +400 m +65 h +4 h +4 h +4345 m +22 h +4346 m +4 h +1 h +4 h +1 h +698 m +1 h +4347 m +4348 m +4 h +59 h +77 h +1 h +27 h +1 h +4 h +4 h +1 h +4 h +279 h +10 h +4 h +4 h +170 h +11 h +4349 m +2374 m +1196 m +1 h +4 h +4350 m +1 h +4 h +4 h +157 h +4 h +10 h +1 h +4 h +4 h +10 h +4256 m +25 h +1016 h +1 h +4351 m +4 h +125 h +1403 h +4352 m +181 h +4 h +4 h +4353 m +230 h +1796 h +443 h +4 h +195 h +297 h +1 h +41 h +1 h +4354 m +1220 m +10 h +4355 m +1 h +196 h +10 h +4356 m +1 h +757 m +4357 m +4 h +4 h +4 h +10 h +25 h +186 h +196 h +4358 m +4 h +4359 m +124 h +1198 m +4360 m +1 h +1 h +990 m +1 h +1 h +4 h +4361 m +125 h +359 h +4 h +278 h +4 h +4362 m +4363 m +4364 m +10 h +4 h +195 h +10 h +4365 m +4366 m +104 h +4367 m +10 h +10 h +10 h 
+147 h +1 h +1 h +4368 m +4 h +4 h +4369 m +4 h +82 h +22 h +57 h +5 h +367 h +164 h +4370 m +569 h +4 h +1 h +1 h +4371 m +4372 m +1 h +869 m +4 h +359 h +4373 m +1 h +91 h +4374 m +1 h +1 h +59 h +1 h +31 h +4 h +59 h +4375 m +4 h +1 h +1137 h +520 h +11 h +10 h +4 h +79 h +11 h +139 h +4 h +91 h +4376 m +488 h +307 h +10 h +4 h +4 h +1 h +4377 m +10 h +569 h +10 h +1 h +1 h +4378 m +82 h +1 h +4 h +4379 m +4 h +45 h +4 h +3209 m +1 h +3 h +10 h +4380 m +10 h +4 h +10 h +4 h +1 h +82 h +4381 m +258 h +109 h +147 h +2022 m +4382 m +4 h +4383 m +4384 m +330 m +57 h +4 h +11 h +4385 m +4386 m +10 h +1 h +4387 m +11 h +4388 m +3 h +82 h +4389 m +4 h +10 h +10 h +10 h +1 h +4390 m +173 h +1 h +4391 m +4 h +1 h +4392 m +1 h +4 h +1 h +1 h +1 h +83 h +4 h +1 h +192 h +109 h +4393 m +31 h +4394 m +1 h +1 h +10 h +4395 m +10 h +10 h +139 h +10 h +4 h +1 h +83 h +4396 m +10 h +1316 m +1 h +4397 m +10 h +4398 m +114 h +31 h +4 h +4 h +174 h +4399 m +10 h +146 h +4 h +4292 m +1 h +1137 h +4 h +4400 m +10 h +4 h +1 h +4 h +55 h +4401 m +1 h +10 h +10 h +10 h +10 h +10 h +10 h +10 h +59 h +11 h +4 h +4402 m +4 h +4403 m +4 h +83 h +1 h +1 h +4404 m +383 h +41 h +4 h +3398 m +1 h +82 h +4 h +10 h +4 h +4 h +1 h +3 h +83 h +10 h +1 h +1 h +4 h +4 h +1 h +4 h +173 h +332 h +4405 m +1 h +4 h +443 h +1 h +4406 m +4 h +4407 m +1 h +4408 m +4409 m +258 h +4 h +4410 m +4 h +820 m +1 h +146 h +4 h +1 h +10 h +10 h +4411 m +41 h +2002 m +4 h +1 h +4 h +1 h +10 h +4412 m +1 h +4 h +1 h +1 h +185 h +1 h +4 h +170 h +4 h +4 h +10 h +4 h +114 h +4 h +10 h +4 h +1 h +59 h +4413 m +1 h +2564 m +10 h +4 h +1 h +1 h +4 h +1659 m +11 h +1 h +4414 m +1 h +1 h +4 h +11 h +4415 m +447 h +4 h +4 h +4416 m +289 h +4 h +10 h +4 h +125 h +4 h +4 h +4 h +295 h +4417 m +1 h +114 h +1 h +172 h +4 h +4418 m +41 h +4419 m +4420 m +11 h +1201 m +4 h +4 h +4421 m +12 h +10 h +1 h +82 h +4 h +185 h +4 h +258 h +65 h +1643 m +1 h +4422 m +4423 m +842 m +1362 m +4424 m +4425 m +36 h +74 h +4 h +125 h +56 h +1 h 
+1 h +1650 m +10 h +10 h +45 h +4426 m +40 h +4427 m +4 h +4428 m +4429 m +1 h +307 h +10 h +59 h +4 h +4430 m +1 h +4431 m +83 h +4 h +4432 m +4433 m +125 h +1 h +1 h +3 h +1261 h +1 h +4 h +1 h +10 h +25 h +4434 m +92 h +10 h +995 m +10 h +4 h +1 h +31 h +4 h +124 h +82 h +1 h +4 h +10 h +4435 m +4 h +1 h +82 h +1 h +1 h +1 h +4 h +229 h +27 h +158 h +4 h +4 h +106 h +266 h +1 h +4436 m +4437 m +4 h +4 h +22 h +170 h +82 h +10 h +4 h +92 h +1 h +4 h +190 h +2788 m +897 h +4438 m +4 h +4 h +10 h +4 h +4 h +4 h +4 h +4439 m +10 h +1 h +11 h +4440 m +4 h +10 h +1 h +1 h +31 h +278 h +1 h +124 h +1 h +4 h +4441 m +1 h +4 h +10 h +4 h +10 h +1 h +4 h +92 h +31 h +10 h +4442 m +307 h +11 h +110 h +10 h +1 h +25 h +4 h +124 h +820 m +4 h +4 h +119 h +4 h +1 h +10 h +4 h +692 h +4 h +4 h +4 h +4443 m +3 h +4 h +1 h +4 h +31 h +4444 m +1 h +59 h +1 h +4445 m +1 h +4446 m +1 h +4 h +1027 h +2186 m +4 h +83 h +4447 m +4448 m +4 h +46 h +4 h +368 h +10 h +4 h +10 h +4 h +4 h +4 h +10 h +4449 m +74 h +4 h +25 h +4 h +104 h +4450 m +601 h +167 h +1 h +4297 m +1 h +4451 m +4452 m +4453 m +4454 m +65 h +4455 m +4456 m +4 h +4457 m +4 h +4 h +184 h +1261 h +4458 m +4459 m +147 h +4 h +4 h +125 h +4 h +1 h +124 h +10 h +4460 m +4461 m +10 h +1 h +4462 m +1 h +4463 m +11 h +65 h +169 h +4464 m +82 h +4 h +383 h +3376 m +10 h +8 h +10 h +435 m +1 h +4 h +4 h +1 h +1 h +10 h +4 h +4465 m +403 h +4466 m +4 h +4 h +4 h +4467 m +10 h +488 h +4 h +10 h +238 h +3 h +10 h +4468 m +28 h +10 h +295 h +1 h +4469 m +158 h +8 h +4470 m +4471 m +1766 h +1 h +1 h +4472 m +10 h +114 h +4 h +4 h +4 h +1 h +1 h +4 h +1 h +10 h +1 h +4473 m +4 h +332 h +92 h +1 h +1137 h +36 h +10 h +2794 m +10 h +10 h +4 h +4 h +4474 m +4475 m +4 h +4476 m +4477 m +1 h +13 h +104 h +109 h +10 h +109 h +1 h +4478 m +820 h +4 h +4 h +4479 m +4 h +4 h +79 h +146 h +4 h +578 m +125 h +266 h +10 h +4480 m +4 h +11 h +4481 m +2780 m +4 h +10 h +10 h +1 h +4482 m +4483 m +4484 m +94 h +4485 m +4486 m +41 h +167 h +146 h +4 
h +10 h +10 h +1 h +1083 m +4 h +4487 m +1 h +1 h +1 h +10 h +1 h +82 h +4488 m +10 h +12 h +10 h +4 h +4 h +82 h +1780 h +4489 m +11 h +447 h +4 h +83 h +124 h +10 h +13 h +4490 m +1 h +4 h +10 h +4 h +10 h +146 h +4491 m +4492 m +4493 m +1 h +4 h +4494 m +10 h +10 h +125 h +4495 m +10 h +10 h +125 h +4 h +82 h +425 m +4 h +56 h +10 h +1128 m +46 h +986 h +146 h +11 h +266 h +3 h +4496 m +4497 m +4498 m +4499 m +108 h +10 h +4500 m +83 h +2931 m +4 h +4501 m +36 h +10 h +4 h +10 h +4 h +27 h +1 h +4 h +70 m +1 h +25 h +10 h +332 h +10 h +4502 m +4503 m +10 h +11 h +1 h +1 h +1 h +313 h +109 h +1 h +4504 m +10 h +94 h +4 h +4505 m +10 h +1650 h +3 h +4506 m +4507 m +1 h +112 h +4508 m +83 h +258 h +10 h +3 h +1 h +1 h +11 h +1 h +4 h +1 h +4 h +124 h +4 h +4509 m +12 h +48 h +1 h +11 h +10 h +1 h +4 h +4510 m +4511 m +65 h +4 h +4 h +1 h +4 h +4512 m +4 h +4513 m +4 h +1250 h +124 h +1 h +4 h +55 h +4 h +4 h +1 h +4514 m +4 h +108 h +57 h +10 h +4515 m +4516 m +4517 m +4 h +10 h +156 h +1 h +164 h +4518 m +4 h +10 h +4 h +4519 m +82 h +4 h +1 h +4520 m +82 h +4 h +4 h +170 h +4521 m +1030 h +157 h +1 h +11 h +4522 m +4523 m +4524 m +4 h +4 h +4 h +10 h +56 h +65 h +10 h +1 h +359 h +1 h +4 h +4 h +195 h +59 h +65 h +4525 m +108 h +114 h +10 h +4526 m +4527 m +1 h +1 h +4 h +1362 h +1 h +1 h +1 h +4 h +1309 m +4528 m +2733 h +10 h +174 h +1003 h +4 h +1027 h +4529 m +276 h +4530 m +4 h +4531 m +4532 m +4 h +1089 h +1 h +4 h +109 h +4533 m +4534 m +520 h +10 h +4535 m +10 h +4 h +1260 m +1 h +11 h +626 m +4 h +4 h +1375 m +601 h +4 h +1 h +146 h +10 h +4536 m +79 h +170 h +4537 m +4538 m +4539 m +10 h +139 h +124 h +4540 m +25 h +10 h +3 h +4 h +4541 m +82 h +146 h +4542 m +4 h +4 h +25 h +185 h +4543 m +4 h +10 h +443 h +1 h +359 h +8 h +278 h +10 h +83 h +4544 m +4 h +10 h +10 h +4545 m +57 h +4546 m +11 h +1 h +557 m +1 h +4547 m +954 m +1 h +1 h +4 h +1548 m +112 h +4548 m +2494 m +4549 m +4550 m +4 h +4 h +10 h +57 h +857 h +4551 m +73 h +358 h +1 h +10 h +1 h 
+297 h +4552 m +4 h +4 h +307 h +4 h +4553 m +1 h +124 h +1 h +59 h +10 h +4554 m +10 h +4 h +4555 m +4 h +1 h +4 h +332 h +138 h +4 h +1 h +1 h +1 h +1 h +4556 m +79 h +355 m +10 h +1418 m +11 h +4 h +939 h +1137 h +118 h +12 h +575 h +10 h +172 h +1 h +4 h +4 h +10 h +10 h +4 h +4557 m +1016 h +186 h +4 h +10 h +10 h +11 h +4558 m +4559 m +8 h +10 h +10 h +4560 m +10 h +4 h +1 h +4 h +238 h +1 h +4 h +4561 m +4562 m +4 h +57 h +1 h +1 h +108 h +1 h +135 h +11 h +1 h +1 h +10 h +4563 m +1 h +97 h +4564 m +4 h +4565 m +4 h +1 h +4566 m +10 h +1 h +10 h +4567 m +4 h +4568 m +4569 m +4570 m +82 h +4571 m +1 h +1 h +10 h +113 h +4 h +109 h +83 h +4 h +5 h +4572 m +986 h +1 h +114 h +1 h +1 h +4573 m +4574 m +4 h +11 h +185 h +11 h +169 h +4 h +1 h +4575 m +332 h +11 h +4576 m +104 h +1 h +4577 m +4 h +4 h +4 h +1564 m +1 h +4 h +4 h +10 h +10 h +4 h +4 h +125 h +1359 h +59 h +73 h +4578 m +11 h +10 h +25 h +4 h +1 h +73 h +40 h +307 h +1 h +138 h +4579 m +104 h +10 h +64 h +1 h +1 h +4580 m +82 h +4 h +4 h +4581 m +1083 m +4 h +4582 m +109 h +125 h +79 h +10 h +45 h +10 h +1 h +10 h +119 h +4583 m +10 h +82 h +4584 m +203 m +4585 m +27 h +857 h +4586 m +65 h +1 h +4587 m +566 m +11 h +4 h +4588 m +4589 m +65 h +11 h +4590 m +4 h +1 h +4 h +4 h +4591 m +97 h +4 h +4 h +10 h +4592 m +109 h +4 h +4593 m +57 h +10 h +4594 m +77 h +1 h +10 h +1 h +4 h +157 h +1 h +1 h +4 h +4 h +1201 h +10 h +4 h +1 h +4 h +4 h +1105 m +10 h +1 h +57 h +185 h +4595 m +4596 m +4 h +143 h +147 h +4597 m +3 h +83 h +4598 m +195 h +4 h +1 h +143 h +172 h +4 h +195 h +4 h +4 h +1 h +4 h +1 h +10 h +146 h +1 h +279 h +3 h +270 h +4599 m +97 h +4600 m +4 h +10 h +4 h +4 h +4 h +4601 m +4 h +110 h +4602 m +10 h +1 h +10 h +4 h +4 h +4603 m +4 h +4604 m +1 h +57 h +4605 m +109 h +139 h +4292 h +10 h +258 h +4606 m +1 h +10 h +4607 m +4608 m +41 h +82 h +274 h +266 h +4609 m +10 h +10 h +11 h +1 h +4 h +4 h +687 h +4 h +1 h +10 h +4 h +4 h +190 h +135 h +56 h +4610 m +351 m +82 h +4 h +4 h +4 h 
+4611 m +4 h +10 h +4 h +12 h +59 h +1 h +13 h +1 h +31 h +1 h +4612 m +97 h +1 h +4 h +48 h +4613 m +986 h +1 h +94 h +10 h +4614 m +10 h +281 m +4615 m +1 h +79 h +673 m +4616 m +109 h +10 h +4 h +64 h +4617 m +192 h +10 h +4618 m +10 h +1868 m +1261 h +4619 m +4620 m +114 h +4 h +10 h +4 h +11 h +1 h +4621 m +170 h +1 h +83 h +4 h +4622 m +351 m +4623 m +79 h +1083 h +4 h +538 h +4624 m +4 h +4625 m +10 h +59 h +4626 m +10 h +4627 m +1 h +4628 m +1220 m +1 h +1 h +4629 m +4630 m +4631 m +4 h +8 h +4632 m +4 h +4633 m +173 h +4634 m +4635 m +4636 m +716 m +10 h +4637 m +41 h +4638 m +146 h +1261 h +10 h +3 h +307 h +4 h +4 h +10 h +1 h +45 h +3742 m +10 h +383 h +10 h +4 h +4 h +1 h +11 h +8 h +4639 m +10 h +4 h +1 h +4640 m +10 h +4641 m +135 h +10 h +4642 m +1 h +4 h +1185 m +10 h +4 h +4 h +4 h +4 h +4643 m +4644 m +4 h +4 h +1 h +1 h +4 h +4645 m +124 h +4 h +10 h +4646 m +4 h +10 h +1 h +4 h +41 h +541 h +1 h +4647 m +1 h +4 h +11 h +4648 m +4649 m +1 h +4650 m +1 h +4651 m +1 h +4652 m +4653 m +22 h +4 h +41 h +1 h +4 h +465 m +10 h +1 h +4 h +10 h +258 h +10 h +4 h +4654 m +4 h +1 h +4655 m +82 h +1 h +4656 m +1 h +4657 m +10 h +4 h +4658 m +4 h +4 h +55 h +4659 m +1 h +4 h +4660 m +109 h +59 h +31 h +4608 m +2925 m +11 h +1 h +1 h +4 h +4 h +4661 m +4 h +10 h +276 h +4662 m +59 h +4663 m +1 h +10 h +4664 m +82 h +1 h +4 h +266 h +1 h +4665 m +4666 m +83 h +4667 m +1 h +22 h +4668 m +181 h +57 h +10 h +1 h +1045 m +1 h +1571 m +1759 m +10 h +3 h +4 h +125 h +4 h +10 h +4669 m +4 h +1 h +4670 m +4 h +1861 m +65 h +4 h +124 h +4 h +1 h +4671 m +489 m +1 h +31 h +4 h +135 h +4 h +10 h +489 h +1650 h +4 h +1 h +1 h +4672 m +48 h +1 h +4673 m +1 h +4674 m +278 h +4 h +10 h +40 h +4675 m +1 h +278 h +1 h +4676 m +4677 m +1 h +82 h +332 h +12 h +4678 m +4679 m +4680 m +4681 m +4 h +4682 m +4 h +4683 m +59 h +10 h +4684 m +82 h +1 h +4 h +10 h +4 h +10 h +1 h +1 h +368 h +4685 m +195 h +10 h +4 h +4 h +1403 h +22 h +1261 h +1 h +11 h +4 h +92 h +4686 m +10 h +185 
h +4 h +10 h +195 h +1430 m +1 h +1 h +4687 m +986 h +4688 m +11 h +463 m +1 h +297 h +4689 m +4 h +1 h +1 h +1851 m +4 h +10 h +4 h +601 h +4690 m +59 h +10 h +3177 m +1403 h +4 h +4691 m +65 h +10 h +4692 m +4693 m +65 h +124 h +1 h +82 h +4694 m +4695 m +4696 m +4 h +4 h +11 h +4 h +65 h +4 h +911 h +1 h +196 h +10 h +4697 m +4698 m +4699 m +10 h +4 h +1 h +11 h +59 h +10 h +1 h +1 h +4 h +4 h +1 h +10 h +74 h +4 h +1 h +4 h +4700 m +114 h +41 h +139 h +4701 m +4 h +258 h +10 h +11 h +4 h +4 h +4 h +4702 m +10 h +57 h +4 h +1 h +4 h +10 h +1 h +1 h +238 h +4703 m +59 h +4 h +1 h +1 h +10 h +1 h +10 h +4704 m +4 h +400 m +1 h +94 h +1 h +4705 m +1 h +4706 m +1 h +4707 m +12 h +10 h +4708 m +97 h +976 h +4 h +3702 m +4 h +1 h +10 h +386 h +4709 m +4710 m +4711 m +104 h +10 h +493 m +11 h +10 h +4712 m +4 h +83 h +322 m +186 h +1 h +1 h +1 h +10 h +1691 m +4713 m +1 h +4714 m +10 h +84 m +536 h +823 m +125 h +11 h +10 h +10 h +1 h +1 h +97 h +1 h +4 h +10 h +1751 m +124 h +4 h +4 h +3679 m +8 h +1449 m +146 h +4 h +4 h +1 h +123 h +125 h +10 h +10 h +1 h +10 h +4715 m +4716 m +27 h +1 h +10 h +1 h +4 h +4717 m +1 h +31 h +83 h +97 h +1 h +10 h +1 h +1 h +976 h +68 m +1 h +4 h +4718 m +83 h +164 h +4719 m +1556 m +4720 m +4 h +4 h +481 m +119 h +4721 m +4722 m +11 h +10 h +48 h +125 h +4723 m +10 h +4724 m +12 h +25 h +10 h +1 h +10 h +204 h +1 h +2719 m +4 h +11 h +4 h +4 h +1 h +94 h +1 h +11 h +4725 m +4726 m +4727 m +10 h +4728 m +1 h +4256 h +172 h +4 h +4729 m +1 h +4730 m +10 h +11 h +10 h +1 h +4731 m +140 m +4 h +4732 m +4733 m +4734 m +10 h +1 h +1838 m +4735 m +1 h +4736 m +113 h +4 h +4 h +386 h +55 h +1045 m +4737 m +10 h +10 h +4 h +4738 m +27 h +45 h +10 h +4739 m +1 h +1 h +4740 m +4741 m +2617 m +170 h +4 h +4742 m +3 h +64 h +4 h +911 h +2041 m +1 h +4 h +57 h +10 h +45 h +4 h +4743 m +11 h +1 h +4 h +36 h +1 h +11 h +258 h +74 h +1884 m +4744 m +4 h +8 h +1 h +1 h +4745 m +304 m +144 h +4 h +1 h +4746 m +83 h +4 h +4 h +109 h +1 h +41 h +4 h +601 
h +1 h +4747 m +10 h +83 h +36 h +4 h +4 h +4 h +4 h +3 h +3396 h +4748 m +4 h +4 h +4 h +4749 m +4750 m +10 h +4 h +4 h +10 h +359 h +84 h +4 h +72 m +11 h +4 h +4751 m +4752 m +1 h +97 h +4 h +322 m +10 h +1 h +10 h +2923 m +123 h +4753 m +4 h +10 h +124 h +1074 m +1 h +1 h +4754 m +4 h +10 h +1 h +10 h +4755 m +4756 m +1 h +55 h +1770 m +4757 m +11 h +73 h +1 h +10 h +73 h +59 h +4256 h +1 h +4758 m +4759 m +4 h +4760 m +1 h +77 h +1 h +1 h +4761 m +1 h +1 h +4 h +124 h +1 h +83 h +1 h +27 h +2447 m +1 h +4 h +83 h +94 h +10 h +4 h +1 h +1 h +10 h +4111 m +10 h +1 h +8 h +170 h +4 h +4762 m +61 m +10 h +1861 m +4 h +10 h +11 h +4763 m +10 h +4 h +3 h +1 h +82 h +1 h +4764 m +1 h +1 h +4765 m +4 h +10 h +238 h +10 h +10 h +1 h +1 h +10 h +258 h +4 h +10 h +4766 m +10 h +10 h +4 h +55 h +4767 m +146 h +27 h +4768 m +1 h +10 h +295 h +4769 m +4770 m +297 h +57 h +10 h +10 h +12 h +4 h +1 h +10 h +172 h +4771 m +10 h +4772 m +386 h +4 h +601 h +278 h +104 h +1 h +1 h +4 h +195 h +569 h +109 h +41 h +10 h +1 h +10 h +4 h +4 h +4773 m +230 h +195 h +31 h +4774 m +4775 m +4776 m +10 h +10 h +4777 m +4778 m +56 h +1 h +4202 m +3 h +10 h +4779 m +4 h +4780 m +1 h +11 h +1 h +184 h +4781 m +41 h +4 h +1 h +1 h +626 m +1 h +1 h +4 h +82 h +1 h +28 h +4 h +1 h +4 h +1 h +270 h +1 h +4 h +1 h +1 h +4 h +4 h +297 h +27 h +4782 m +10 h +339 m +156 h +10 h +4 h +1650 h +1 h +4783 m +4 h +1 h +196 h +1 h +10 h +4 h +1454 m +4 h +185 h +4297 h +4 h +1 h +4784 m +4 h +4 h +1 h +11 h +4 h +4 h +196 h +281 m +4 h +10 h +4 h +57 h +4 h +10 h +266 h +10 h +4 h +184 h +41 h +1 h +69 h +169 h +11 h +82 h +41 h +4785 m +4 h +4 h +65 h +4786 m +10 h +976 h +4 h +4 h +4 h +4 h +27 h +4787 m +10 h +4 h +4788 m +4789 m +11 h +4 h +4 h +4790 m +4 h +10 h +4 h +3904 m +1 h +2887 m +1 h +11 h +4 h +82 h +1 h +4791 m +110 h +4792 m +4 h +4793 m +10 h +4 h +4 h +143 h +54 m +4 h +10 h +4794 m +4 h +4 h +1 h +4 h +1 h +443 h +11 h +10 h +83 h +147 h +4 h +4795 m +65 h +4 h +279 h +10 h +1 h +4 h 
+1 h +65 h +4796 m +1 h +55 h +41 h +4 h +4 h +104 h +4 h +3 h +4797 m +195 h +1 h +4798 m +1508 m +1 h +9 m +4799 m +13 h +4800 m +10 h +1 h +4 h +25 h +4801 m +1 h +1089 h +4802 m +1 h +4 h +1 h +4 h +4803 m +11 h +4804 m +4 h +10 h +4805 m +108 h +4806 m +4807 m +124 h +1 h +82 h +258 h +82 h +1 h +57 h +83 h +1 h +10 h +172 h +3768 m +4 h +10 h +1 h +73 h +4 h +4808 m +4809 m +4 h +4 h +4 h +92 h +4 h +45 h +4 h +11 h +4 h +10 h +1379 m +1 h +4810 m +4 h +4811 m +10 h +4812 m +4 h +4 h +109 h +4813 m +10 h +10 h +4 h +1 h +4 h +11 h +10 h +4 h +109 h +4 h +4814 m +443 h +36 h +258 h +1 h +1 h +1 h +10 h +10 h +4815 m +1 h +4816 m +1 h +4817 m +687 h +10 h +1 h +1 h +601 h +4 h +4818 m +4 h +11 h +4 h +4 h +4819 m +1 h +4 h +4 h +4820 m +4 h +10 h +1024 m +1 h +10 h +4821 m +230 h +4 h +4 h +10 h +1 h +4822 m +31 h +11 h +1 h +4 h +1 h +1 h +59 h +4 h +279 h +1 h +31 h +1 h +1116 m +1 h +31 h +61 m +146 h +1 h +4 h +1403 h +1 h +1 h +4 h +2002 m +4823 m +74 h +4824 m +4 h +4 h +170 h +939 h +4 h +1 h +4825 m +1309 m +338 h +10 h +4 h +41 h +10 h +4 h +4826 m +1 h +1 h +1016 h +4827 m +4828 m +4 h +4829 m +1 h +4830 m +1030 h +4 h +195 h +123 h +11 h +4831 m +10 h +123 h +10 h +10 h +40 h +4832 m +4 h +4833 m +4 h +1 h +4 h +1316 m +124 h +1 h +358 h +184 h +1 h +4 h +57 h +319 h +4834 m +1 h +258 h +4 h +964 m +976 h +4 h +4 h +10 h +4 h +4 h +4 h +4 h +4 h +4835 m +4836 m +4 h +1 h +386 h +4 h +4837 m +79 h +10 h +4 h +4 h +4 h +4 h +10 h +4838 m +10 h +4 h +10 h +4839 m +2558 m +4 h +4724 m +4840 m +10 h +976 h +4 h +1 h +371 h +4514 m +4 h +10 h +4 h +65 h +124 h +94 h +5 h +119 h +4 h +4 h +110 h +4841 m +4 h +10 h +10 h +4 h +10 h +10 h +195 h +10 h +4842 m +1 h +4 h +13 h +4843 m +4 h +55 h +10 h +4844 m +265 h +10 h +278 h +4845 m +999 m +74 h +493 m +1 h +1 h +4846 m +1 h +4847 m +4 h +1 h +2379 h +4848 m +10 h +4849 m +10 h +10 h +4 h +4850 m +1016 h +83 h +10 h +4851 m +1 h +266 h +10 h +4852 m +11 h +1 h +4853 m +1 h +1 h +4854 m +17 m +119 h +965 m 
+4 h +1 h +4 h +1 h +4 h +1 h +10 h +59 h +97 h +4855 m +65 h +11 h +124 h +4856 m +1 h +57 h +575 h +4 h +1027 h +1 h +1 h +4857 m +601 h +1508 m +10 h +4858 m +4 h +4859 m +74 h +4 h +4860 m +10 h +125 h +757 m +4 h +79 h +4 h +4861 m +4862 m +443 h +41 h +4 h +1 h +4 h +10 h +4 h +1 h +4 h +4863 m +1 h +4 h +230 h +73 h +1 h +10 h +11 h +4 h +4864 m +4865 m +82 h +10 h +4 h +4866 m +1 h +332 h +1 h +4 h +4 h +307 h +4 h +57 h +4867 m +4868 m +4869 m +10 h +10 h +1 h +57 h +57 h +1 h +4870 m +1 h +1 h +2887 m +1 h +4 h +4 h +3050 m +1 h +10 h +4871 m +4872 m +4873 m +1344 m +1 h +4874 m +4 h +4 h +1 h +4875 m +41 h +181 h +4876 m +10 h +11 h +1 h +718 m +10 h +4 h +536 h +59 h +1 h +4 h +1 h +4877 m +4 h +4 h +4 h +4 h +1 h +10 h +92 h +97 h +4878 m +4879 m +10 h +1 h +4 h +656 m +1128 m +4 h +4880 m +1 h +10 h +97 h +11 h +124 h +1 h +1 h +11 h +3 h +4881 m +4 h +4 h +4 h +4882 m +4883 m +4 h +10 h +4 h +10 h +4 h +10 h +4 h +307 h +1 h +172 h +1 h +1 h +1 h +4 h +13 h +25 h +224 h +109 h +4 h +4884 m +109 h +10 h +11 h +1 h +4 h +12 h +4 h +1 h +147 h +4 h +10 h +3 h +74 h +4885 m +119 h +1470 h +332 h +4886 m +1359 h +57 h +4887 m +1 h +4 h +4888 m +147 h +1 h +10 h +10 h +4889 m +4 h +4890 m +506 m +726 m +238 h +83 h +36 h +1 h +82 h +10 h +4891 m +55 h +104 h +10 h +4 h +4 h +57 h +119 h +1 h +10 h +10 h +4892 m +238 h +57 h +4 h +4893 m +10 h +4 h +4894 m +1 h +97 h +109 h +11 h +13 h +4 h +4895 m +1 h +4 h +1 h +4896 m +1 h +4897 m +170 h +156 h +139 h +4 h +4898 m +3 h +4 h +10 h +65 h +10 h +4 h +1 h +4899 m +1105 m +57 h +25 h +1 h +73 h +3435 m +4900 m +4901 m +1 h +4 h +1 h +4902 m +143 h +10 h +1 h +1 h +4903 m +97 h +4 h +82 h +1016 h +4 h +4 h +258 h +425 m +114 h +4904 m +4905 m +4906 m +4 h +4907 m +12 h +1 h +4908 m +229 h +1 h +569 h +10 h +10 h +4 h +10 h +4909 m +10 h +4910 m +4 h +1 h +4 h +4 h +1 h +11 h +4 h +4 h +143 h +1 h +73 h +113 h +4911 m +10 h +996 m +4912 m +57 h +4 h +4 h +4913 m +181 h +1 h +4 h +10 h +4 h +124 h +11 h +1 h 
+1261 h +4 h +4914 m +4 h +92 h +1 h +1 h +4915 m +167 h +59 h +4 h +57 h +10 h +25 h +45 h +4916 m +10 h +10 h +10 h +371 h +4 h +1 h +82 h +1 h +4 h +4917 m +1 h +4 h +83 h +4 h +4918 m +4919 m +10 h +97 h +1 h +4 h +12 h +1 h +307 h +1 h +4920 m +10 h +4 h +10 h +4921 m +1 h +124 h +4 h +4922 m +124 h +4 h +83 h +23 h +1 h +4 h +22 h +1 h +11 h +82 h +195 h +4 h +4923 m +4 h +146 h +4924 m +258 h +4 h +3 h +3025 m +4 h +4 h +146 h +4 h +10 h +4 h +4925 m +144 h +1 h +10 h +4 h +4 h +4 h +4 h +1772 h +4 h +4 h +69 h +4 h +4926 m +2887 h +147 h +10 h +1 h +4927 m +399 h +4 h +57 h +4928 m +1027 h +1030 h +4 h +238 h +4 h +4 h +1 h +1089 h +10 h +1 h +4929 m +4930 m +1 h +10 h +4931 m +1 h +1 h +4932 m +12 h +994 m +104 h +1 h +4933 m +4 h +4934 m +4 h +57 h +4 h +4935 m +4 h +4 h +4936 m +196 h +4937 m +10 h +1 h +1 h +4 h +4938 m +10 h +83 h +4939 m +13 h +1 h +4940 m +4 h +169 h +4941 m +190 h +27 h +4929 m +4 h +4 h +347 m +4942 m +1650 h +10 h +4943 m +124 h +57 h +1 h +4944 m +169 h +4 h +4 h +4945 m +10 h +167 h +1 h +4 h +109 h +1 h +10 h +4 h +4 h +10 h +1 h +27 h +135 h +1 h +4946 m +4947 m +1 h +4297 h +1 h +104 h +4948 m +4949 m +22 h +1 h +4950 m +4 h +172 h +10 h +4 h +976 h +11 h +1499 m +1 h +11 h +1 h +4 h +10 h +59 h +119 h +74 h +4520 m +13 h +425 m +82 h +10 h +1309 m +146 h +4951 m +4952 m +125 h +10 h +1 h +186 h +57 h +4953 m +4954 m +1737 m +4 h +1576 m +10 h +4955 m +11 h +1 h +4 h +104 h +1105 m +4 h +1 h +4956 m +57 h +477 m +2148 m +1 h +4 h +1780 h +566 m +2625 m +1 h +204 h +4 h +4 h +10 h +4957 m +4 h +1 h +125 h +4958 m +1293 m +4 h +10 h +4959 m +73 h +4 h +104 h +1 h +10 h +147 h +1 h +10 h +181 h +258 h +4960 m +10 h +4 h +82 h +1137 h +4961 m +4962 m +4963 m +1 h +238 h +1714 m +1975 m +10 h +4 h +64 h +256 h +4 h +4964 m +1 h +4 h +170 h +82 h +195 h +1 h +92 h +4 h +97 h +4965 m +4 h +1105 h +717 m +4966 m +135 h +1 h +4 h +4967 m +4968 m +74 h +1 h +74 h +156 h +4969 m +31 h +1 h +41 h +4970 m +1 h +4 h +4 h +307 h +82 h +11 h 
+4971 m +10 h +4 h +4972 m +347 m +10 h +8 h +4 h +3 h +4973 m +10 h +70 m +359 h +1 h +10 h +493 h +1 h +4 h +10 h +1137 h +1105 h +4 h +4 h +56 h +4974 m +195 h +10 h +41 h +1 h +4975 m +4 h +1 h +4 h +10 h +1 h +4 h +4 h +1 h +1 h +1 h +4 h +1 h +1470 h +4 h +1 h +4 h +4 h +625 m +4976 m +172 h +64 h +27 h +1 h +196 h +4 h +4 h +4977 m +1 h +4978 m +4 h +4 h +124 h +4979 m +1 h +4 h +10 h +10 h +4 h +124 h +1337 m +4 h +1 h +31 h +1 h +186 h +1 h +12 h +10 h +92 h +4980 m +4 h +4 h +41 h +11 h +4441 m +10 h +4 h +1 h +112 h +4981 m +297 h +4982 m +4 h +4 h +4 h +79 h +83 h +1 h +3558 m +1379 m +4 h +10 h +57 h +1 h +4983 m +4984 m +4 h +83 h +4985 m +869 m +10 h +1 h +1 h +1 h +4986 m +4 h +41 h +4987 m +10 h +10 h +4988 m +4989 m +717 m +1650 h +10 h +46 h +13 h +4 h +1089 h +10 h +265 h +11 h +4990 m +109 h +4 h +1 h +10 h +4 h +4991 m +1 h +1 h +186 h +91 h +4992 m +10 h +1 h +2148 m +36 h +195 h +4993 m +4994 m +11 h +4995 m +25 h +1 h +4 h +1 h +82 h +1 h +4 h +4996 m +4997 m +1 h +4998 m +4999 m +10 h +10 h +5000 m +4 h +1 h +5001 m +5002 m +4 h +4 h +272 m +5003 m +1 h +1 h +5004 m +1 h +10 h +4 h +83 h +4 h +5005 m +92 h +3 h +4 h +11 h +4 h +4 h +4 h +1 h +57 h +4 h +1 h +434 m +5006 m +4 h +4 h +4 h +1 h +1 h +4 h +169 h +5007 m +4 h +4 h +5008 m +104 h +4 h +1 h +1 h +10 h +1 h +25 h +5009 m +10 h +10 h +109 h +4 h +5010 m +1898 m +10 h +5011 m +5012 m +4 h +10 h +1 h +509 m +4 h +1 h +1 h +5013 m +5014 m +74 h +1 h +114 h +3 h +1027 h +1337 m +5015 m +10 h +4 h +4 h +5016 m +307 h +4 h +1 h +1016 h +144 h +11 h +4 h +5017 m +190 h +5018 m +2379 h +10 h +1 h +5019 m +57 h +10 h +135 h +1 h +5020 m +1 h +167 h +4 h +4 h +5021 m +10 h +41 h +143 h +10 h +10 h +147 h +1250 h +1 h +5022 m +5023 m +1 h +10 h +1 h +5024 m +5025 m +338 h +74 h +4 h +1 h +10 h +10 h +61 h +307 h +135 h +10 h +10 h +10 h +1 h +125 h +41 h +5026 m +167 h +5027 m +11 h +4 h +276 h +5028 m +25 h +5029 m +4 h +5030 m +195 h +4 h +10 h +124 h +716 m +224 h +10 h +4 h +5031 m +10 h 
+4 h +195 h +276 h +5032 m +5033 m +5034 m +1261 h +1 h +5035 m +10 h +1 h +5036 m +276 h +25 h +1 h +278 h +5037 m +10 h +4 h +359 h +3 h +4 h +4 h +109 h +5038 m +556 h +692 h +1 h +4 h +1 h +4 h +1 h +10 h +196 h +5039 m +3 h +4 h +4 h +55 h +1 h +1 h +4 h +5040 m +4 h +10 h +190 h +5041 m +1 h +359 h +5042 m +11 h +170 h +4 h +11 h +143 h +276 h +1 h +4 h +74 h +10 h +31 h +1 h +10 h +10 h +57 h +4 h +4 h +5043 m +4 h +5044 m +4 h +5045 m +10 h +5046 m +4 h +167 h +5047 m +147 h +65 h +5048 m +443 h +4 h +3 h +4 h +10 h +4 h +1 h +10 h +5049 m +10 h +5050 m +4 h +129 h +464 m +10 h +869 m +1 h +3338 m +1 h +570 h +57 h +4 h +5051 m +143 h +5052 m +109 h +31 h +1 h +1 h +4 h +4 h +5053 m +170 h +4 h +4 h +4 h +10 h +114 h +10 h +4464 m +45 h +10 h +10 h +4 h +4 h +5054 m +4 h +5055 m +5056 m +1 h +1 h +5057 m +5058 m +83 h +5059 m +5060 m +3 h +737 m +4 h +5061 m +11 h +4 h +10 h +77 h +1281 m +4 h +4 h +687 h +5062 m +123 h +4 h +11 h +4 h +4 h +41 h +1 h +10 h +83 h +4 h +10 h +4 h +1790 m +112 h +10 h +10 h +82 h +5063 m +4 h +13 h +4 h +1089 h +307 h +83 h +4 h +1 h +4 h +82 h +124 h +5064 m +5065 m +4 h +1 h +1 h +73 h +125 h +10 h +338 h +10 h +2459 m +5066 m +5067 m +1 h +1 h +5068 m +4 h +1 h +10 h +11 h +4 h +195 h +5069 m +1 h +119 h +5070 m +139 h +10 h +802 m +1017 m +322 h +1835 h +4 h +687 h +1 h +1 h +146 h +5071 m +82 h +1 h +59 h +10 h +164 h +114 h +4 h +1 h +1137 h +1 h +1 h +1 h +425 h +1835 h +3 h +156 h +538 h +5072 m +5073 m +5074 m +5075 m +4 h +1081 m +238 h +5076 m +4 h +1 h +5077 m +10 h +203 m +319 h +10 h +124 h +1 h +5078 m +83 h +4 h +1 h +10 h +10 h +1 h +4 h +45 h +10 h +10 h +4 h +57 h +5079 m +92 h +1634 m +5080 m +10 h +5081 m +64 h +995 m +41 h +1 h +5082 m +22 h +25 h +4 h +4 h +4 h +5083 m +1 h +1 h +4 h +45 h +5084 m +10 h +5085 m +92 h +5086 m +4 h +5087 m +4 h +83 h +190 h +4 h +4 h +45 h +156 h +11 h +4 h +4 h +10 h +10 h +4 h +1 h +1 h +4 h +4 h +1362 h +4 h +1 h +1 h +94 h +1 h +5088 m +464 h +11 h +1 h +4 h +986 h +4 
h +4 h +5089 m +2418 m +5090 m +83 h +1 h +5091 m +4695 m +1 h +4 h +10 h +1191 m +82 h +4 h +5092 m +185 h +10 h +10 h +1 h +1 h +10 h +10 h +1710 m +10 h +1 h +173 h +4 h +124 h +2520 m +570 h +250 h +10 h +8 h +4 h +4 h +1 h +5093 m +169 h +1 h +10 h +64 h +5094 m +3 h +4 h +4 h +1 h +5095 m +1 h +5096 m +5097 m +569 h +170 h +4 h +83 h +57 h +83 h +169 h +4 h +4 h +110 h +31 h +5098 m +538 h +4 h +1 h +4 h +5099 m +124 h +10 h +5100 m +10 h +5101 m +3025 m +10 h +1 h +5102 m +1 h +5103 m +1 h +4 h +1 h +4 h +1470 h +4 h +59 h +10 h +4 h +12 h +10 h +1 h +135 h +5104 m +10 h +4 h +113 h +4 h +5105 m +278 h +5106 m +4 h +1089 h +94 h +4 h +5107 m +1 h +36 h +4 h +1308 m +10 h +5108 m +1 h +2028 m +5109 m +5110 m +1 h +25 h +4 h +763 m +25 h +1 h +1 h +2558 m +4 h +1780 h +79 h +1 h +82 h +109 h +4 h +347 h +25 h +3 h +5111 m +11 h +1 h +5112 m +1 h +1 h +25 h +10 h +229 h +1 h +1 h +41 h +536 h +5113 m +5114 m +4 h +4 h +5115 m +5116 m +1 h +64 h +1083 h +5117 m +4 h +13 h +5118 m +4 h +10 h +10 h +2710 m +4 h +10 h +1 h +1 h +97 h +211 m +1 h +181 h +172 h +4 h +4 h +1 h +1 h +1574 m +170 h +124 h +3028 m +1 h +103 m +5119 m +11 h +10 h +4 h +10 h +1 h +5120 m +59 h +4 h +83 h +4 h +5121 m +1 h +4 h +1 h +5122 m +4 h +196 h +1 h +2688 m +31 h +82 h +10 h +4 h +10 h +10 h +25 h +69 h +4 h +195 h +5123 m +79 h +1 h +258 h +10 h +5124 m +4 h +5125 m +562 m +1 h +5126 m +4 h +1 h +353 m +4 h +4 h +1 h +4 h +4 h +4 h +59 h +5127 m +4 h +4 h +125 h +4 h +5128 m +5129 m +1 h +4 h +172 h +5130 m +4 h +11 h +4 h +5131 m +10 h +10 h +4 h +4 h +1 h +41 h +10 h +278 h +4 h +1 h +10 h +1 h +4 h +124 h +1 h +1 h +1 h +10 h +1403 h +5132 m +125 h +5133 m +4 h +5134 m +307 h +74 h +5135 m +1 h +5136 m +5137 m +10 h +4 h +11 h +1 h +4 h +1 h +10 h +25 h +4 h +1 h +1 h +4 h +4 h +1 h +113 h +1 h +109 h +4 h +10 h +64 h +297 h +4 h +4 h +118 h +4 h +4 h +10 h +10 h +4 h +172 h +5138 m +5139 m +74 h +4 h +140 m +1053 m +1027 h +4 h +1 h +5140 m +4 h +1122 m +45 h +5141 m +1 h +1499 
m +5142 m +4 h +4 h +4 h +1 h +5143 m +1650 h +2251 m +10 h +4 h +1403 h +4 h +5144 m +123 h +4 h +4 h +478 m +4 h +4 h +55 h +4 h +174 h +1 h +4 h +10 h +4 h +4 h +1 h +11 h +1 h +4 h +5145 m +4 h +4 h +1 h +4 h +4 h +11 h +10 h +412 m +4 h +11 h +1379 m +4 h +108 h +1 h +5146 m +1 h +11 h +116 m +10 h +123 h +10 h +10 h +4 h +124 h +489 h +10 h +4 h +185 h +10 h +146 h +10 h +276 h +1 h +13 h +1 h +4 h +5147 m +285 m +4 h +109 h +190 h +4 h +170 h +388 m +11 h +10 h +10 h +185 h +5148 m +5149 m +10 h +1 h +10 h +1278 m +5150 m +10 h +1261 h +4 h +5151 m +423 m +4 h +4 h +1 h +4 h +1 h +10 h +4 h +10 h +10 h +10 h +114 h +10 h +966 m +190 h +41 h +278 h +11 h +129 h +65 h +443 h +83 h +1 h +5152 m +119 h +5153 m +1 h +4 h +10 h +4 h +1 h +82 h +5154 m +299 m +1 h +5155 m +4 h +10 h +4 h +10 h +10 h +11 h +4 h +1 h +69 h +1 h +1 h +5156 m +172 h +57 h +1 h +5157 m +4 h +5158 m +10 h +74 h +4 h +1 h +10 h +5159 m +230 h +935 m +10 h +5160 m +1 h +4 h +4 h +4 h +718 h +986 h +1 h +5161 m +840 m +11 h +5162 m +4 h +4 h +11 h +1 h +5163 m +31 h +1 h +1 h +5164 m +1 h +976 h +10 h +4 h +4 h +10 h +1 h +1 h +10 h +173 h +195 h +5165 m +10 h +125 h +135 h +5166 m +1 h +5167 m +1 h +5168 m +10 h +4 h +4 h +57 h +1 h +82 h +1 h +4 h +4 h +3546 m +1 h +4 h +307 h +1 h +1 h +64 h +4 h +11 h +3 h +10 h +4 h +5169 m +3 h +4 h +4 h +1 h +10 h +1647 m +1470 h +169 h +3396 h +5170 m +1 h +258 h +4 h +59 h +1 h +358 h +3 h +10 h +1 h +124 h +74 h +114 h +1710 m +5171 m +4 h +1 h +1 h +10 h +60 m +10 h +5172 m +5173 m +4 h +25 h +109 h +4 h +5174 m +4 h +10 h +5175 m +10 h +10 h +83 h +5176 m +10 h +5177 m +11 h +5178 m +5179 m +10 h +1 h +12 h +10 h +5180 m +5181 m +1 h +5182 m +238 h +74 h +11 h +173 h +5183 m +4 h +10 h +64 h +1 h +4 h +5184 m +5185 m +1 h +5186 m +4 h +109 h +10 h +55 h +1 h +10 h +1201 h +146 h +4 h +10 h +4 h +1 h +5187 m +1 h +4 h +1 h +11 h +1 h +2418 m +10 h +10 h +4 h +11 h +4 h +4 h +1 h +5188 m +1 h +1 h +1 h +5189 m +4 h +4 h +1 h +109 h +5190 m +25 h 
+5191 m +57 h +4 h +57 h +11 h +4 h +258 h +1 h +5192 m +1 h +10 h +10 h +109 h +4 h +10 h +1 h +3 h +1 h +258 h +36 h +4 h +4 h +1 h +5193 m +25 h +5194 m +935 m +25 h +10 h +601 h +1 h +1 h +4 h +83 h +5195 m +109 h +10 h +109 h +2971 m +4 h +4 h +4 h +1 h +5196 m +65 h +5197 m +11 h +1 h +266 h +172 h +1766 h +5198 m +1 h +1 h +5199 m +10 h +92 h +10 h +4 h +1 h +5200 m +181 h +5201 m +135 h +10 h +10 h +1 h +4 h +4 h +4 h +1 h +361 m +4 h +5202 m +5203 m +1 h +5204 m +1 h +4 h +1 h +10 h +4 h +5205 m +124 h +1893 m +12 h +46 h +509 m +5206 m +1 h +4 h +4 h +1 h +5207 m +5208 m +1 h +10 h +601 h +4 h +1 h +5209 m +5210 m +4 h +48 h +169 h +10 h +59 h +172 h +10 h +1 h +65 h +5211 m +371 h +4 h +14 m +1053 m +10 h +4 h +5212 m +5213 m +82 h +10 h +1 h +10 h +5214 m +11 h +4 h +1 h +5215 m +1 h +779 h +1955 m +5216 m +1370 m +575 h +59 h +5217 m +5218 m +1 h +4 h +10 h +10 h +5219 m +10 h +4 h +195 h +770 m +295 h +195 h +10 h +1 h +4 h +8 h +10 h +5220 m +10 h +59 h +1 h +84 h +4 h +4 h +4 h +4 h +4 h +97 h +266 h +11 h +5221 m +1 h +1 h +10 h +73 h +10 h +4 h +5222 m +1 h +109 h +4 h +4 h +4 h +4 h +5223 m +464 h +5224 m +5225 m +11 h +11 h +4 h +4 h +1470 h +114 h +4 h +83 h +139 h +129 h +190 h +1 h +5226 m +10 h +289 h +45 h +64 h +4 h +1 h +11 h +1 h +5227 m +82 h +5228 m +5229 m +3 h +83 h +250 h +3 h +10 h +4 h +5230 m +1 h +97 h +299 m +3555 m +4 h +5231 m +97 h +4 h +5232 m +10 h +147 h +4 h +5233 m +41 h +4 h +1 h +4 h +129 h +1 h +1 h +57 h +10 h +5234 m +4 h +57 h +56 h +5235 m +118 h +135 h +4 h +1 h +4 h +1822 m +606 m +124 h +25 h +5236 m +1 h +601 h +4 h +4 h +90 m +92 h +59 h +332 h +4 h +1 h +11 h +1 h +4 h +4 h +4 h +92 h +1 h +4 h +1 h +3 h +4 h +5237 m +164 h +10 h +4 h +593 m +4 h +125 h +10 h +1 h +5238 m +124 h +1 h +4 h +10 h +1 h +5239 m +55 h +4 h +4 h +125 h +1 h +1 h +1 h +4 h +11 h +10 h +4 h +10 h +10 h +5240 m +674 m +4 h +1 h +65 h +97 h +41 h +687 h +4 h +10 h +172 h +1 h +4 h +885 m +1261 h +10 h +1 h +11 h +2928 m +147 h +5241 
m +114 h +266 h +5242 m +170 h +2769 m +4 h +140 m +1 h +10 h +25 h +1 h +10 h +1 h +57 h +1 h +59 h +4 h +4 h +4 h +4 h +5243 m +5244 m +4 h +5245 m +345 m +3630 m +5246 m +5247 m +10 h +104 h +10 h +79 h +55 h +93 h +4 h +687 h +5248 m +339 m +5249 m +1 h +79 h +4 h +279 h +104 h +12 h +10 h +5250 m +10 h +1 h +5251 m +536 h +4 h +25 h +1 h +1 h +10 h +4 h +5252 m +4 h +4 h +1 h +11 h +10 h +1 h +1 h +10 h +124 h +1 h +1 h +4 h +5253 m +147 h +2550 m +4 h +143 h +4 h +4 h +4 h +4 h +4 h +8 h +5254 m +4 h +4 h +11 h +258 h +10 h +4 h +1828 m +4 h +5255 m +10 h +4 h +31 h +73 h +10 h +195 h +10 h +1 h +1 h +1108 m +8 h +10 h +4 h +276 h +110 h +82 h +5256 m +10 h +79 h +5257 m +10 h +5258 m +31 h +570 h +5259 m +4 h +1 h +170 h +1 h +447 h +11 h +297 h +135 h +773 m +1 h +4 h +4 h +79 h +4 h +13 h +4 h +4 h +74 h +167 h +1 h +4 h +10 h +28 h +332 h +124 h +5260 m +1 h +10 h +4 h +1 h +5261 m +1 h +538 h +4 h +1 h +4 h +77 h +1 h +1697 m +1 h +59 h +5262 m +5263 m +4 h +386 h +4 h +5264 m +36 h +4 h +4 h +92 h +124 h +4 h +10 h +12 h +3 h +3 h +4 h +1 h +195 h +25 h +4 h +10 h +10 h +3 h +2788 m +10 h +5265 m +10 h +358 h +5266 m +4 h +55 h +1 h +147 h +1 h +83 h +1 h +10 h +1 h +935 h +5267 m +5268 m +5269 m +4 h +1 h +5270 m +5271 m +5272 m +1 h +59 h +4 h +1 h +4 h +15 m +5273 m +10 h +10 h +5274 m +82 h +1116 m +59 h +5275 m +10 h +83 h +31 h +1 h +1 h +1 h +1 h +5276 m +4 h +297 h +4 h +5277 m +79 h +10 h +4 h +5278 m +1 h +1 h +4 h +5279 m +5280 m +64 h +10 h +113 h +5281 m +5282 m +4 h +5283 m +4 h +31 h +3112 m +4 h +195 h +1 h +10 h +104 h +181 h +1 h +1 h +4 h +28 h +10 h +146 h +83 h +10 h +1 h +1 h +4 h +5284 m +4 h +57 h +4 h +5285 m +1 h +1 h +4 h +79 h +5286 m +779 h +11 h +1 h +4 h +1 h +41 h +1 h +2625 m +10 h +258 h +65 h +11 h +1 h +5287 m +1 h +59 h +615 m +8 h +169 h +92 h +5288 m +83 h +28 h +4 h +1138 m +5289 m +31 h +4 h +55 h +167 h +5290 m +2928 m +125 h +4 h +4 h +1 h +1 h +5291 m +1 h +10 h +10 h +5292 m +5293 m +1 h +10 h +185 h +4 h +1 
h +10 h +4 h +146 h +10 h +4 h +5294 m +4 h +4 h +25 h +5295 m +5296 m +10 h +5297 m +59 h +620 m +1 h +4 h +94 h +4 h +74 h +5298 m +73 h +92 h +135 h +181 h +1 h +57 h +170 h +1 h +371 h +10 h +4 h +5299 m +10 h +4 h +1 h +5300 m +10 h +5301 m +41 h +450 m +4 h +5302 m +5303 m +10 h +10 h +4 h +4 h +11 h +82 h +4 h +4 h +4 h +1 h +1 h +4 h +109 h +10 h +11 h +124 h +10 h +125 h +1 h +5304 m +5305 m +10 h +4 h +10 h +4 h +1 h +935 h +10 h +1981 m +10 h +2116 m +41 h +10 h +4 h +5306 m +167 h +1 h +5307 m +2002 m +36 h +5308 m +4 h +10 h +4 h +10 h +4 h +10 h +1955 m +5309 m +4 h +322 h +4 h +11 h +57 h +4 h +4 h +25 h +184 h +5310 m +1 h +59 h +1 h +4 h +1 h +124 h +1 h +4 h +4 h +1 h +82 h +757 m +332 h +1938 m +5311 m +4 h +1 h +1 h +1 h +5312 m +5313 m +10 h +5314 m +4 h +4 h +5315 m +41 h +5316 m +1 h +4 h +10 h +4 h +1 h +5317 m +1 h +10 h +4 h +83 h +4 h +11 h +4528 m +4 h +10 h +4 h +359 h +5318 m +4 h +448 m +10 h +10 h +10 h +4 h +1685 m +3033 m +146 h +4 h +10 h +4 h +601 h +83 h +195 h +1 h +41 h +1 h +4 h +10 h +11 h +4 h +5319 m +5320 m +185 h +124 h +4 h +278 h +147 h +3111 m +173 h +5321 m +1 h +5322 m +3558 m +59 h +4 h +1 h +119 h +578 m +1 h +1 h +4 h +10 h +4 h +10 h +1 h +143 h +11 h +5323 m +4 h +4 h +5324 m +190 h +250 h +11 h +146 h +1 h +5325 m +1 h +319 h +109 h +11 h +1 h +36 h +1 h +11 h +104 h +3188 m +692 h +1 h +4 h +4 h +10 h +3 h +4 h +57 h +11 h +1 h +10 h +146 h +4 h +4 h +4 h +434 h +190 h +4 h +4 h +5326 m +10 h +92 h +4301 m +4 h +1 h +5327 m +1 h +1 h +4 h +83 h +5328 m +83 h +4 h +10 h +57 h +79 h +5329 m +5330 m +10 h +65 h +4 h +190 h +4 h +1 h +10 h +5331 m +4 h +1738 m +112 h +10 h +4 h +91 h +1 h +11 h +109 h +1 h +1 h +4 h +4 h +5332 m +5333 m +10 h +69 h +4 h +5334 m +195 h +10 h +1 h +31 h +4 h +109 h +10 h +219 m +5335 m +386 h +4 h +83 h +4 h +4 h +5336 m +82 h +3025 m +11 h +4 h +10 h +5337 m +5338 m +59 h +4 h +5339 m +1 h +10 h +1642 h +5340 m +22 h +1 h +10 h +4 h +104 h +10 h +4 h +5341 m +1 h +138 h +5342 m 
+620 m +5343 m +27 h +1122 m +10 h +2418 h +4 h +31 h +185 h +4 h +10 h +4 h +4 h +97 h +5344 m +135 h +1 h +1 h +5345 m +104 h +1 h +4 h +808 h +5346 m +1 h +1105 h +299 h +4 h +65 h +10 h +649 m +13 h +4 h +5347 m +83 h +1 h +104 h +1 h +929 m +59 h +170 h +10 h +144 h +1 h +5348 m +5349 m +10 h +5350 m +4 h +5351 m +8 h +4 h +4 h +4 h +5352 m +4 h +5353 m +10 h +56 h +332 h +4 h +11 h +5354 m +5355 m +10 h +4240 m +185 h +4 h +4 h +104 h +4 h +5356 m +5357 m +5358 m +4 h +10 h +5359 m +1096 m +5360 m +25 h +4 h +5361 m +10 h +1 h +1 h +4 h +4 h +4 h +11 h +4 h +1 h +10 h +4 h +279 h +1454 m +4 h +4 h +4 h +164 h +4 h +1 h +10 h +4 h +1 h +4 h +1 h +5362 m +5363 m +11 h +5364 m +4 h +10 h +4 h +83 h +4 h +1 h +5365 m +10 h +10 h +104 h +4 h +274 h +5366 m +65 h +5367 m +3 h +1 h +1 h +45 h +4 h +1 h +4 h +25 h +4 h +5368 m +4 h +4 h +83 h +110 h +4 h +5369 m +5370 m +10 h +4 h +11 h +10 h +4 h +83 h +1 h +5371 m +4 h +4 h +1 h +4 h +4 h +5372 m +4 h +5373 m +4 h +10 h +383 h +4 h +4 h +1 h +195 h +4 h +10 h +1 h +10 h +109 h +266 h +5374 m +36 h +4029 m +4 h +4 h +1 h +5375 m +2794 m +8 h +31 h +4 h +1003 h +557 m +185 h +11 h +4 h +229 h +10 h +5376 m +5377 m +59 h +1796 m +164 h +10 h +1 h +158 h +5378 m +124 h +1 h +5379 m +147 h +4 h +99 m +4 h +10 h +31 h +83 h +1 h +10 h +57 h +4 h +279 h +73 h +358 h +4 h +5380 m +31 h +4 h +5381 m +1 h +5382 m +27 h +1 h +124 h +4240 m +4 h +5383 m +10 h +5384 m +1952 m +4 h +4 h +143 h +185 h +4 h +1685 m +10 h +5385 m +10 h +4 h +57 h +5386 m +10 h +4 h +4 h +3025 m +820 h +10 h +5387 m +5388 m +1 h +3 h +4 h +10 h +196 h +1 h +11 h +4 h +4 h +4 h +4 h +779 h +5389 m +10 h +10 h +1 h +11 h +5390 m +1 h +4 h +4 h +1 h +1 h +167 h +10 h +4 h +109 h +4 h +262 m +3089 m +203 m +4 h +4 h +274 h +1 h +1 h +4 h +147 h +83 h +820 h +73 h +109 h +4 h +41 h +5391 m +258 h +172 h +140 h +4 h +10 h +10 h +5392 m +4 h +278 h +4 h +10 h +4 h +55 h +4 h +692 h +265 h +181 h +358 h +1 h +5393 m +1 h +4 h +5394 m +10 h +97 h +181 h +10 
h +4 h +82 h +5395 m +93 h +569 h +45 h +5396 m +1 h +4 h +5397 m +266 h +338 h +2124 m +2308 m +10 h +4 h +4 h +10 h +10 h +4 h +4 h +4 h +114 h +45 h +97 h +10 h +74 h +4 h +1 h +4 h +1771 m +4 h +5398 m +146 h +4 h +4 h +59 h +167 h +5399 m +150 m +386 h +1 h +113 h +10 h +1 h +4 h +10 h +135 h +4 h +1 h +1 h +4 h +5400 m +307 h +4 h +10 h +73 h +97 h +3668 m +4 h +97 h +1 h +4 h +10 h +1 h +5401 m +57 h +65 h +5402 m +1 h +1 h +1 h +195 h +1 h +79 h +1 h +1 h +4 h +143 h +5403 m +57 h +140 h +124 h +83 h +5404 m +3 h +10 h +1 h +2494 m +10 h +1 h +5405 m +4 h +1 h +70 m +1 h +4 h +5406 m +112 h +11 h +1772 h +5407 m +1 h +10 h +10 h +2595 m +10 h +3 h +1725 m +5408 m +5409 m +1 h +538 h +5410 m +5411 m +5412 m +13 h +1 h +10 h +5413 m +5414 m +97 h +10 h +10 h +4 h +5415 m +1 h +5416 m +1 h +4 h +4 h +11 h +10 h +4 h +10 h +10 h +976 h +82 h +10 h +5417 m +1 h +1 h +1642 h +5418 m +4 h +5419 m +169 h +4 h +5420 m +4 h +4 h +25 h +5421 m +4 h +533 m +82 h +5422 m +5423 m +4 h +5424 m +1250 h +4 h +4 h +4 h +4101 m +4 h +1 h +4 h +5425 m +258 h +1 h +10 h +4 h +4 h +1 h +4 h +1 h +5426 m +4 h +83 h +1 h +1393 m +11 h +556 h +57 h +4 h +4 h +1 h +383 h +4 h +5427 m +5428 m +10 h +56 h +10 h +5429 m +31 h +92 h +4 h +10 h +11 h +4 h +4 h +5430 m +11 h +5431 m +5432 m +105 m +10 h +5433 m +4 h +5434 m +2359 m +125 h +10 h +82 h +109 h +10 h +45 h +1 h +10 h +57 h +5435 m +4 h +4 h +57 h +642 m +123 h +1372 m +1 h +59 h +4 h +83 h +1835 h +5436 m +1 h +2436 m +59 h +4 h +4 h +4 h +1 h +144 h +83 h +83 h +11 h +338 h +4 h +109 h +10 h +28 h +10 h +4 h +1 h +10 h +1 h +1 h +13 h +4 h +4 h +77 h +1281 m +4 h +5437 m +5438 m +1 h +25 h +10 h +4 h +82 h +2582 m +5439 m +4 h +10 h +1780 h +10 h +5440 m +5441 m +5442 m +5443 m +4 h +124 h +157 h +10 h +1 h +4 h +520 m +8 h +4 h +1 h +10 h +885 m +935 h +10 h +164 h +10 h +4 h +5444 m +10 h +10 h +10 h +4 h +1 h +4 h +13 h +4 h +10 h +10 h +4 h +5445 m +5446 m +4 h +113 h +5447 m +73 h +65 h +1 h +83 h +1 h +1 h +31 h +5448 
m +4 h +10 h +11 h +4 h +1685 h +164 h +1 h +2281 m +10 h +297 h +110 h +10 h +12 h +5449 m +5450 m +4 h +1 h +144 h +4 h +332 h +5451 m +10 h +65 h +10 h +195 h +12 h +10 h +28 h +2769 m +124 h +5452 m +57 h +36 h +5453 m +1016 h +125 h +5454 m +1 h +11 h +3768 m +4 h +5455 m +278 h +125 h +4 h +41 h +1 h +5456 m +5457 m +1 h +4 h +4 h +4057 m +4 h +5458 m +1 h +2459 m +114 h +224 h +11 h +10 h +1 h +5459 m +5460 m +10 h +8 h +4 h +10 h +3 h +1 h +4 h +5461 m +1 h +4 h +5462 m +5463 m +10 h +3837 m +4 h +1 h +4 h +25 h +4 h +1 h +5464 m +4 h +1685 h +4 h +4 h +4218 m +5465 m +10 h +25 h +10 h +146 h +4 h +25 h +1 h +157 h +4 h +10 h +10 h +358 h +1 h +4 h +5466 m +10 h +4 h +10 h +172 h +386 h +5467 m +181 h +10 h +4306 m +41 h +5468 m +4 h +10 h +83 h +1 h +5469 m +4 h +5470 m +36 h +1 h +5471 m +41 h +10 h +4 h +10 h +10 h +5472 m +4 h +4 h +1089 h +5473 m +256 h +10 h +4 h +4 h +4 h +4 h +1 h +425 h +4 h +5474 m +4 h +5475 m +10 h +4 h +4 h +10 h +386 h +1 h +5476 m +1309 h +1 h +1472 m +135 h +4 h +10 h +124 h +31 h +147 h +4 h +5477 m +4 h +45 h +10 h +1 h +97 h +5478 m +10 h +82 h +4 h +4 h +4 h +5479 m +5480 m +10 h +1 h +4 h +5481 m +10 h +12 h +4 h +10 h +10 h +4 h +276 h +4 h +77 h +147 h +4 h +10 h +1 h +10 h +4 h +4 h +2846 m +1 h +82 h +1 h +4 h +4 h +10 h +5482 m +12 h +11 h +4 h +10 h +10 h +12 h +1 h +1 h +4 h +5483 m +10 h +169 h +4 h +358 h +5484 m +4 h +5485 m +5348 m +140 h +5486 m +1 h +5487 m +964 m +2172 m +82 h +1 h +5488 m +307 h +4 h +4 h +10 h +5489 m +332 h +57 h +1 h +1027 h +1 h +92 h +1 h +4 h +4 h +5490 m +5491 m +4 h +4 h +1 h +31 h +1 h +4 h +5492 m +4 h +319 h +5493 m +4 h +4 h +5494 m +5495 m +10 h +1 h +12 h +10 h +1 h +1 h +1 h +4 h +5496 m +4 h +1 h +1 h +5497 m +10 h +4 h +4 h +4 h +74 h +5498 m +5499 m +4 h +5500 m +1201 h +1 h +5501 m +97 h +4 h +5502 m +125 h +4 h +12 h +25 h +82 h +368 h +4 h +4 h +94 h +157 h +4 h +125 h +4 h +1 h +4 h +109 h +4 h +1 h +383 h +57 h +10 h +1780 h +65 h +716 m +368 h +1 h +5503 m +1 h +55 
h +4 h +4 h +1137 h +4 h +41 h +4 h +10 h +11 h +104 h +5504 m +10 h +45 h +181 h +5505 m +10 h +4 h +129 h +10 h +28 h +5506 m +692 h +83 h +332 h +4 h +1 h +4 h +5507 m +4 h +5508 m +10 h +124 h +4 h +1 h +4 h +1553 m +5509 m +4 h +5510 m +10 h +5511 m +1553 m +10 h +109 h +5512 m +5513 m +10 h +1650 h +196 h +219 m +4 h +10 h +104 h +4 h +36 h +10 h +266 h +10 h +57 h +1 h +1822 h +104 h +195 h +5514 m +4 h +5515 m +1 h +4 h +4 h +82 h +5516 m +1 h +1403 h +4 h +1403 h +124 h +1 h +1 h +1 h +5517 m +2961 m +10 h +4 h +10 h +5518 m +25 h +1 h +4 h +1 h +5519 m +4 h +10 h +4 h +5520 m +56 h +5521 m +10 h +1 h +156 h +2245 m +1 h +1 h +1 h +10 h +4 h +57 h +82 h +1 h +1548 m +4 h +1 h +23 h +185 h +295 h +5522 m +4 h +5523 m +4 h +4 h +1 h +536 h +10 h +104 h +4 h +4 h +5524 m +10 h +10 h +10 h +10 h +1 h +4 h +4 h +615 m +5525 m +5526 m +1 h +4 h +1 h +1 h +10 h +1 h +4 h +976 h +1 h +4 h +4 h +258 h +1772 h +5527 m +5528 m +4 h +5529 m +1 h +10 h +4 h +10 h +4 h +4 h +143 h +27 h +976 h +10 h +57 h +83 h +13 h +4 h +2111 m +5530 m +5531 m +2418 h +1 h +5532 m +184 h +383 h +1 h +4 h +10 h +4 h +10 h +170 h +4 h +4 h +5533 m +10 h +4 h +10 h +10 h +4 h +59 h +10 h +5534 m +82 h +4 h +172 h +1 h +12 h +1 h +83 h +4 h +4 h +266 h +4 h +5505 m +3 h +59 h +109 h +5535 m +3216 m +5536 m +11 h +447 h +129 h +1 h +5537 m +582 m +4 h +41 h +124 h +4 h +1 h +5538 m +4576 m +4 h +94 h +3 h +4 h +1835 h +238 h +383 h +5539 m +285 m +31 h +5540 m +57 h +1 h +10 h +5541 m +5542 m +718 h +143 h +5543 m +4 h +4 h +5544 m +1 h +718 h +4 h +4 h +5545 m +332 h +97 h +5546 m +10 h +5547 m +10 h +4 h +10 h +10 h +228 m +10 h +1 h +25 h +5548 m +5549 m +4 h +5550 m +59 h +976 h +4 h +5551 m +5552 m +5553 m +4 h +10 h +4 h +41 h +1 h +57 h +5554 m +172 h +1 h +3 h +4 h +3499 m +2110 m +1 h +587 m +1 h +10 h +1 h +10 h +11 h +1 h +1116 m +59 h +5555 m +1 h +83 h +4 h +5556 m +10 h +1 h +45 h +10 h +1 h +1948 m +1 h +143 h +5557 m +4 h +10 h +297 h +74 h +195 h +297 h +112 h +143 h +5558 
m +146 h +10 h +5559 m +5560 m +1 h +4 h +1 h +258 h +5561 m +10 h +11 h +10 h +4 h +91 h +1 h +208 m +119 h +4 h +1 h +1 h +1 h +10 h +1642 h +1 h +65 h +181 h +4 h +1780 h +4 h +82 h +4 h +4 h +59 h +1 h +4 h +1 h +5562 m +5563 m +185 h +1 h +1 h +1 h +2257 m +5564 m +5565 m +4 h +146 h +4 h +10 h +2846 m +3 h +2719 m +2124 h +399 h +5566 m +119 h +5567 m +5568 m +1 h +1 h +56 h +5569 m +4 h +3209 m +10 h +5570 m +1 h +238 h +1 h +5571 m +11 h +5572 m +1 h +4 h +5573 m +124 h +1 h +173 h +359 h +4 h +57 h +4 h +4 h +4 h +1 h +4 h +1737 m +92 h +10 h +4 h +10 h +4 h +59 h +5574 m +1 h +108 h +57 h +4 h +11 h +1 h +4 h +1 h +10 h +5575 m +4 h +10 h +82 h +1 h +4 h +4 h +5576 m +5577 m +124 h +1 h +1 h +5578 m +2374 m +1 h +4 h +5579 m +4 h +5580 m +10 h +25 h +1 h +5581 m +1 h +11 h +4 h +11 h +10 h +4 h +25 h +10 h +4 h +5582 m +4 h +1 h +10 h +4 h +443 h +4 h +4 h +1 h +4 h +5583 m +10 h +5584 m +276 h +10 h +4 h +109 h +3 h +124 h +28 h +4 h +10 h +10 h +69 h +1 h +10 h +4 h +1 h +1 h +10 h +4 h +97 h +4 h +10 h +4 h +289 h +1 h +10 h +4 h +5585 m +10 h +5586 m +1 h +4 h +1 h +5587 m +91 h +1766 h +158 h +1 h +5588 m +5589 m +4 h +5590 m +4 h +4 h +1470 h +4 h +4 h +1 h +5591 m +1 h +74 h +57 h +4 h +10 h +5592 m +1 h +4039 m +56 h +10 h +10 h +83 h +5593 m +4 h +11 h +13 h +4 h +2475 m +1 h +1 h +10 h +167 h +4 h +1 h +5594 m +1 h +10 h +1 h +13 h +10 h +139 h +5595 m +22 h +57 h +5596 m +563 m +11 h +4 h +1 h +139 h +5597 m +10 h +4 h +262 h +169 h +1790 m +3 h +779 h +4 h +1 h +4 h +425 h +569 h +123 h +5598 m +1 h +1 h +4 h +1 h +10 h +57 h +57 h +11 h +10 h +146 h +1 h +10 h +5599 m +1 h +11 h +59 h +4 h +10 h +1 h +1 h +5600 m +1 h +4 h +5601 m +31 h +4 h +11 h +5602 m +4 h +211 m +4645 m +11 h +11 h +11 h +11 h +57 h +238 h +4 h +1 h +5603 m +164 h +1 h +4 h +10 h +181 h +299 h +4 h +358 h +105 m +1 h +10 h +109 h +10 h +10 h +1 h +5604 m +1 h +5605 m +5606 m +295 h +5607 m +5608 m +5609 m +5610 m +278 h +272 m +10 h +1 h +5611 m +4 h +5612 m +4 h +5613 m 
+4 h +5614 m +4 h +5615 m +10 h +124 h +31 h +5616 m +5617 m +5618 m +5619 m +1 h +4 h +4 h +1 h +4 h +5620 m +5621 m +4 h +433 m +1 h +4 h +1 h +158 h +3 h +73 h +124 h +4 h +5622 m +4 h +118 h +1 h +109 h +82 h +1016 h +4 h +4 h +1 h +73 h +278 h +5623 m +31 h +5624 m +10 h +5625 m +114 h +64 h +4 h +1 h +4 h +4 h +10 h +4 h +4 h +4 h +4 h +124 h +97 h +10 h +1 h +10 h +10 h +4 h +1 h +5626 m +4 h +5627 m +3558 m +1389 m +4 h +4 h +229 h +10 h +5628 m +4 h +10 h +368 h +170 h +45 h +4 h +10 h +332 h +4 h +1 h +4 h +4 h +1 h +1 h +4 h +11 h +4 h +5629 m +1 h +224 h +4 h +4 h +5630 m +1 h +1 h +11 h +4 h +10 h +4 h +5631 m +73 h +124 h +83 h +649 m +146 h +97 h +119 h +297 h +5632 m +1 h +4 h +5633 m +12 h +5634 m +10 h +570 h +4 h +109 h +41 h +1737 m +55 h +1 h +82 h +4 h +5635 m +5636 m +143 h +41 h +5637 m +4 h +4 h +5638 m +1 h +10 h +1 h +1 h +10 h +282 m +10 h +1 h +59 h +1 h +2423 m +1 h +649 m +1 h +1 h +57 h +1 h +5639 m +4 h +3 h +4 h +41 h +124 h +371 h +10 h +5640 m +4 h +82 h +1 h +4 h +4 h +3 h +4 h +5641 m +59 h +1 h +4 h +57 h +265 h +10 h +5642 m +5643 m +1 h +1 h +5644 m +125 h +5645 m +10 h +4 h +556 h +4 h +5646 m +488 m +1 h +1 h +4 h +4 h +1030 h +1 h +1 h +4 h +4 h +10 h +170 h +4 h +5647 m +65 h +478 m +4 h +10 h +74 h +433 m +69 h +4 h +4 h +4 h +5648 m +82 h +4 h +403 h +4 h +5649 m +10 h +1 h +4 h +1 h +1 h +82 h +1 h +5650 m +4 h +1 h +1 h +1 h +4 h +4 h +11 h +1 h +1 h +13 h +73 h +1 h +4 h +114 h +10 h +3 h +1 h +4 h +5651 m +82 h +124 h +5652 m +276 h +1564 m +11 h +464 h +4 h +69 h +1 h +5653 m +4 h +1 h +59 h +2688 m +5654 m +1 h +1 h +4 h +5655 m +276 h +1 h +4 h +4 h +4 h +4 h +181 h +4 h +170 h +4 h +124 h +129 h +5656 m +22 h +59 h +1 h +4 h +109 h +5657 m +10 h +5658 m +230 h +593 m +10 h +238 h +4 h +4 h +3 h +4 h +3 h +1 h +4 h +10 h +104 h +5659 m +10 h +1 h +5660 m +1 h +1 h +1 h +5661 m +72 m +4 h +4 h +4 h +10 h +1 h +170 h +97 h +1 h +4 h +1 h +109 h +94 h +139 h +4 h +536 h +4 h +10 h +10 h +5662 m +4 h +135 h +4 h +4 
h +4 h +4 h +12 h +5650 m +1 h +4 h +4 h +31 h +270 h +5663 m +4 h +5664 m +25 h +649 h +1 h +4 h +4 h +10 h +10 h +5665 m +5666 m +11 h +1120 m +23 h +1 h +10 h +258 h +4 h +185 h +4 h +57 h +1 h +4 h +1 h +4 h +11 h +1 h +4 h +4 h +976 h +307 h +10 h +10 h +4 h +4 h +4 h +5667 m +112 h +4 h +157 h +5668 m +5669 m +2625 m +31 h +4 h +1 h +173 h +5670 m +147 h +114 h +4 h +4 h +1 h +83 h +1 h +1 h +601 h +5671 m +4 h +601 h +83 h +1 h +4 h +1137 h +4 h +10 h +5672 m +4 h +31 h +4 h +11 h +1 h +44 m +1 h +104 h +109 h +10 h +4 h +4 h +110 h +5673 m +5674 m +10 h +1 h +4 h +4 h +5675 m +4 h +3 h +5225 m +386 h +5676 m +4 h +5141 m +238 h +1 h +11 h +1 h +146 h +5677 m +1 h +4 h +192 h +10 h +4 h +4 h +10 h +4 h +5678 m +5679 m +2591 m +5680 m +41 h +4 h +4 h +5681 m +4 h +5682 m +10 h +8 h +10 h +5683 m +1 h +5684 m +1 h +4 h +4 h +1 h +1 h +5685 m +143 h +4 h +1 h +5686 m +1 h +4 h +4 h +278 h +4 h +4 h +13 h +104 h +10 h +25 h +5687 m +83 h +4 h +1 h +64 h +5688 m +1 h +1 h +4 h +1470 h +10 h +4 h +1790 m +1 h +4 h +10 h +5689 m +5690 m +1 h +4 h +1914 m +10 h +5691 m +4 h +1218 m +1359 h +5692 m +4 h +4 h +5693 m +1627 m +56 h +109 h +65 h +4 h +1737 h +4 h +10 h +4 h +464 h +238 h +3 h +104 h +109 h +630 m +167 h +4 h +1 h +5694 m +4 h +4 h +11 h +1 h +4 h +114 h +5695 m +4 h +1751 m +41 h +94 h +4 h +1 h +4 h +4 h +97 h +5696 m +5697 m +1 h +4 h +911 h +41 h +40 h +10 h +1 h +1 h +113 h +10 h +1 h +4 h +5698 m +4 h +1 h +1 h +5699 m +10 h +1 h +73 h +1 h +4 h +25 h +250 h +10 h +10 h +79 h +4 h +5230 m +5700 m +1 h +10 h +10 h +10 h +11 h +4 h +1 h +1 h +73 h +1 h +186 h +109 h +4 h +4 h +4 h +125 h +5701 m +1714 m +5702 m +11 h +5703 m +229 h +31 h +10 h +5704 m +10 h +11 h +4 h +1 h +10 h +4 h +10 h +4 h +1 h +22 h +5705 m +8 h +5706 m +1 h +4 h +1 h +1 h +381 m +4 h +5707 m +10 h +83 h +4 h +4 h +270 h +4 h +124 h +5708 m +4 h +156 h +1 h +125 h +4 h +1 h +5709 m +11 h +4 h +5710 m +4 h +10 h +5711 m +4 h +4 h +278 h +74 h +4 h +4 h +4 h +1 h +31 h +10 h +4 h 
+1 h +1 h +1 h +5712 m +10 h +4 h +4 h +4 h +5713 m +1 h +1948 m +5714 m +4 h +1027 h +1 h +10 h +97 h +1269 m +4 h +4 h +4 h +10 h +4 h +4 h +57 h +1191 m +185 h +109 h +90 m +3 h +10 h +83 h +4 h +5715 m +10 h +4 h +10 h +5716 m +1 h +4 h +5717 m +124 h +135 h +5718 m +1 h +4 h +10 h +10 h +25 h +10 h +5719 m +4 h +1 h +11 h +82 h +1 h +4 h +10 h +1 h +10 h +204 h +1 h +4 h +158 h +10 h +143 h +4 h +5720 m +5721 m +5722 m +4 h +10 h +5723 m +1 h +4 h +25 h +5724 m +11 h +45 h +928 m +10 h +10 h +112 h +10 h +109 h +4 h +10 h +5725 m +5726 m +5727 m +5728 m +114 h +1 h +124 h +1389 m +135 h +5729 m +4 h +5730 m +4 h +158 h +10 h +10 h +1 h +83 h +1 h +5731 m +5732 m +1 h +4 h +1 h +10 h +2625 m +5733 m +4 h +4 h +4 h +5734 m +11 h +4 h +31 h +1 h +1 h +10 h +10 h +11 h +10 h +5735 m +1 h +10 h +94 h +4 h +4 h +4 h +246 m +74 h +169 h +4 h +4 h +181 h +10 h +4 h +65 h +82 h +3 h +5736 m +1 h +10 h +1442 m +31 h +11 h +4 h +5737 m +5738 m +4 h +10 h +2851 m +73 h +1 h +1 h +3477 m +41 h +4 h +10 h +196 h +1 h +10 h +5739 m +4 h +1 h +5740 m +4 h +2998 m +1 h +4 h +104 h +4 h +5741 m +5742 m +5743 m +10 h +5744 m +4 h +5745 m +10 h +4 h +5746 m +5747 m +5748 m +109 h +97 h +10 h +129 h +1 h +1 h +10 h +10 h +10 h +10 h +4 h +4 h +5749 m +5750 m +83 h +238 h +4 h +13 h +5751 m +4 h +5752 m +10 h +10 h +10 h +4 h +5753 m +5754 m +4 h +430 m +4 h +4 h +1 h +4 h +11 h +83 h +55 h +4 h +1785 m +4 h +4 h +10 h +4 h +5755 m +10 h +4 h +4 h +601 h +4 h +5756 m +59 h +4 h +1 h +1 h +1 h +94 h +10 h +10 h +8 h +4 h +11 h +1 h +1 h +319 h +4 h +11 h +4 h +113 h +41 h +1 h +11 h +5757 m +1 h +3150 m +4 h +10 h +56 h +4 h +10 h +4 h +1 h +2775 m +109 h +5758 m +1 h +156 h +73 h +1 h +1 h +31 h +1 h +1 h +5759 m +1 h +5760 m +4 h +124 h +1 h +10 h +11 h +1 h +4 h +4 h +4 h +1 h +82 h +146 h +4 h +10 h +5761 m +289 h +1 h +5762 m +5763 m +4 h +4 h +4 h +185 h +4 h +4 h +185 h +109 h +124 h +156 h +1 h +59 h +1 h +5764 m +5765 m +1138 m +5766 m +5767 m +31 h +109 h +4 h +92 h +5768 
m +4 h +4 h +83 h +10 h +3 h +74 h +36 h +4 h +31 h +64 h +59 h +1 h +4 h +757 m +1 h +4 h +11 h +4 h +4 h +10 h +1 h +5769 m +1 h +4 h +238 h +5770 m +986 h +4 h +5771 m +12 h +10 h +5772 m +4 h +1650 h +4 h +1 h +1 h +109 h +4 h +10 h +4 h +1127 m +4 h +4 h +110 h +5773 m +5774 m +10 h +65 h +1 h +954 m +1 h +5775 m +5206 m +447 h +4549 m +640 m +4 h +1 h +83 h +123 h +717 h +11 h +1 h +195 h +4 h +82 h +4 h +170 h +1 h +1 h +5776 m +5777 m +4 h +928 m +170 h +13 h +1 h +4 h +124 h +4 h +238 h +2617 m +5778 m +1 h +1 h +10 h +4 h +4 h +1 h +124 h +5779 m +383 h +10 h +10 h +4 h +1 h +4 h +1362 h +5780 m +10 h +869 h +1 h +5781 m +1 h +5542 m +4 h +1 h +1 h +10 h +10 h +4 h +10 h +124 h +10 h +4 h +45 h +1838 m +109 h +1 h +11 h +10 h +1 h +10 h +1 h +4 h +79 h +4 h +10 h +10 h +278 h +4 h +295 h +5782 m +5783 m +1 h +4906 m +4 h +4 h +1 h +1655 m +1 h +11 h +4 h +4 h +1 h +10 h +185 h +272 m +1 h +10 h +109 h +10 h +11 h +1 h +5784 m +1 h +4 h +104 h +1 h +5785 m +5786 m +5787 m +250 h +3212 m +4 h +5788 m +616 m +5789 m +1 h +11 h +10 h +986 h +4 h +10 h +5790 m +5791 m +626 m +36 h +4 h +10 h +10 h +196 h +10 h +5792 m +25 h +1 h +27 h +1470 h +5793 m +56 h +1 h +5794 m +10 h +83 h +4 h +4 h +297 h +10 h +4 h +196 h +5795 m +5796 m +5797 m +82 h +4 h +10 h +4 h +10 h +5798 m +10 h +143 h +1 h +5799 m +1 h +3 h +5800 m +2887 h +31 h +1089 h +59 h +1 h +1 h +1 h +601 h +1 h +4 h +4 h +4 h +10 h +5801 m +1 h +5802 m +10 h +4 h +57 h +4 h +4 h +164 h +59 h +41 h +5803 m +124 h +5804 m +5059 m +403 h +104 h +167 h +4 h +5805 m +1 h +1 h +10 h +119 h +1 h +10 h +4 h +1 h +10 h +4 h +2769 m +10 h +1 h +1 h +1 h +5806 m +5807 m +92 h +10 h +10 h +4 h +4 h +4 h +569 h +13 h +10 h +4 h +2958 m +5808 m +266 h +1 h +1 h +4 h +4 h +10 h +4 h +1 h +5809 m +56 h +1 h +169 h +4 h +4 h +1 h +10 h +5810 m +1 h +4 h +31 h +4 h +1 h +4 h +109 h +1 h +5811 m +1 h +4 h +4106 m +1 h +4 h +5812 m +888 m +4 h +1 h +5813 m +11 h +108 h +4 h +4 h +103 m +10 h +5814 m +464 h +4 h +295 h 
+3 h +5815 m +10 h +170 h +5816 m +5817 m +10 h +386 h +1 h +4 h +83 h +1 h +56 h +1 h +1 h +11 h +4 h +295 h +10 h +4 h +10 h +4 h +1 h +4 h +109 h +1 h +124 h +10 h +1 h +10 h +4 h +1 h +22 h +10 h +5818 m +4780 m +4 h +1 h +10 h +83 h +274 h +5819 m +10 h +10 h +4 h +5820 m +5821 m +5822 m +10 h +10 h +11 h +266 h +4 h +10 h +265 h +158 h +1 h +114 h +55 h +4 h +4 h +10 h +4 h +56 h +65 h +5823 m +1 h +4 h +5824 m +10 h +11 h +190 h +463 m +4 h +1714 m +10 h +1250 h +41 h +4 h +147 h +1 h +5825 m +4 h +4 h +5826 m +4 h +83 h +83 h +10 h +170 h +1 h +25 h +10 h +57 h +10 h +1 h +4 h +5827 m +11 h +5828 m +1 h +5829 m +5830 m +4 h +5831 m +4 h +2883 m +4 h +1 h +59 h +91 h +4 h +10 h +56 h +57 h +5832 m +4 h +4 h +5833 m +1 h +10 h +5834 m +4 h +1 h +4 h +10 h +4 h +82 h +1 h +330 m +4 h +31 h +4 h +4 h +4 h +1 h +1 h +12 h +4 h +5835 m +3 h +11 h +1 h +4 h +4 h +5836 m +173 h +5837 m +1 h +10 h +5838 m +65 h +4 h +5839 m +5840 m +1724 m +4 h +10 h +77 h +4 h +65 h +5841 m +65 h +1 h +1 h +338 h +1886 m +31 h +10 h +1 h +41 h +170 h +10 h +10 h +113 h +278 h +64 h +5842 m +79 h +4 h +1 h +5584 m +4 h +1 h +10 h +74 h +41 h +84 h +5843 m +1 h +1 h +5844 m +1 h +4 h +10 h +2733 h +4 h +939 m +10 h +1 h +83 h +48 h +4 h +692 h +4 h +40 h +1 h +10 h +4 h +25 h +1 h +10 h +1 h +5845 m +10 h +4 h +10 h +4 h +1 h +11 h +266 h +11 h +5846 m +1 h +447 h +1 h +5847 m +464 h +289 h +109 h +59 h +10 h +1 h +3799 m +4 h +10 h +4 h +83 h +11 h +4 h +447 h +4 h +4 h +5848 m +5849 m +1 h +10 h +4 h +79 h +83 h +1 h +33 m +5850 m +11 h +4 h +1 h +4 h +124 h +25 h +10 h +5851 m +4 h +1 h +4 h +278 h +204 h +10 h +4 h +4 h +59 h +911 h +10 h +2172 m +40 h +5852 m +3 h +4 h +5853 m +5854 m +4 h +1 h +10 h +79 h +250 h +164 h +1261 h +5855 m +4 h +4 h +1409 m +4 h +5856 m +147 h +1 h +10 h +5857 m +1 h +1 h +5858 m +368 h +1105 h +1 h +10 h +10 h +1 h +278 h +4 h +79 h +3 h +125 h +10 h +1 h +5859 m +5860 m +1 h +4 h +4933 m +1 h +5861 m +10 h +1 h +10 h +4 h +5862 m +265 h +185 h 
+332 h +556 h +10 h +1 h +82 h +219 m +196 h +1 h +11 h +135 h +4 h +4 h +1 h +1 h +1 h +1 h +172 h +4 h +5863 m +195 h +4 h +10 h +5864 m +4 h +41 h +4 h +1 h +4 h +630 m +125 h +4 h +65 h +4 h +4 h +1 h +1 h +135 h +1 h +1 h +5865 m +1 h +1 h +5141 m +4 h +1 h +5866 m +1 h +10 h +5867 m +10 h +10 h +5868 m +10 h +4 h +5869 m +5870 m +316 m +10 h +1 h +83 h +4 h +123 h +5871 m +10 h +1 h +4564 m +5872 m +146 h +4 h +5873 m +4 h +4 h +4 h +5874 m +1 h +4 h +4 h +5875 m +10 h +1 h +5876 m +4 h +82 h +1 h +4 h +5877 m +4 h +5878 m +4 h +1 h +4 h +4 h +265 h +10 h +82 h +1 h +5879 m +1 h +3 h +1 h +1 h +265 h +74 h +5880 m +4 h +10 h +11 h +10 h +10 h +289 h +4 h +4 h +5881 m +5882 m +10 h +4 h +4 h +4 h +41 h +113 h +156 h +1 h +4 h +5883 m +10 h +5884 m +57 h +4 h +5885 m +258 h +10 h +1 h +5886 m +27 h +1 h +2116 m +4 h +5887 m +5888 m +1 h +1 h +5889 m +434 h +4 h +4 h +1 h +4 h +196 h +5890 m +4 h +794 m +1 h +4 h +5891 m +5892 m +4 h +108 h +5893 m +82 h +2308 m +1 h +1 h +1 h +10 h +1725 m +112 h +5894 m +31 h +196 h +4 h +1894 m +4 h +4 h +5895 m +5896 m +150 m +4 h +146 h +10 h +10 h +4 h +10 h +5897 m +125 h +1 h +5898 m +57 h +192 h +1 h +57 h +5899 m +45 h +1 h +10 h +307 h +5900 m +125 h +258 h +31 h +124 h +10 h +1 h +10 h +1 h +1 h +4 h +368 h +83 h +4 h +692 h +10 h +83 h +10 h +83 h +5901 m +10 h +4 h +10 h +5902 m +195 h +2459 m +4 h +4 h +104 h +1 h +4 h +5903 m +146 h +129 h +10 h +59 h +1790 h +986 h +4 h +5904 m +5905 m +146 h +5906 m +368 h +4 h +1 h +3 h +307 h +110 h +1201 h +3469 m +10 h +10 h +4 h +911 h +1 h +538 h +4 h +4 h +403 h +5907 m +601 h +4 h +4 h +4 h +55 h +5908 m +104 h +1 h +83 h +1 h +13 h +83 h +25 h +4 h +447 h +5909 m +1 h +190 h +135 h +59 h +4 h +124 h +1 h +5910 m +5911 m +143 h +4 h +4 h +4 h +5912 m +1639 m +5913 m +109 h +1 h +143 h +4 h +4 h +1 h +4 h +196 h +5914 m +5915 m +5916 m +1 h +5917 m +5918 m +10 h +10 h +5919 m +2459 m +10 h +4 h +3555 m +11 h +5920 m +1 h +82 h +13 h +4 h +1 h +10 h +1 h +4 h +12 h +5921 
m +5922 m +1955 h +1 h +124 h +1 h +297 h +295 h +13 h +56 h +10 h +11 h +135 h +56 h +5923 m +338 h +1 h +125 h +41 h +46 h +3534 m +11 h +10 h +692 h +5924 m +1 h +5925 m +4 h +82 h +5926 m +10 h +48 h +4 h +4 h +4 h +4 h +10 h +45 h +219 m +5927 m +172 h +164 h +10 h +1 h +4 h +4 h +10 h +10 h +4780 m +4 h +5928 m +10 h +1016 h +4 h +4 h +1 h +10 h +64 h +11 h +59 h +164 h +4 h +601 h +5929 m +10 h +4 h +4441 m +1 h +4 h +1 h +5930 m +79 h +258 h +5931 m +185 h +25 h +1316 m +4 h +1 h +31 h +1796 m +4 h +4 h +94 h +45 h +5932 m +5933 m +4 h +1 h +279 h +1 h +10 h +258 h +29 m +1 h +167 h +1 h +5934 m +55 h +4 h +146 h +10 h +1 h +10 h +5935 m +4 h +687 h +57 h +1 h +5936 m +59 h +10 h +92 h +109 h +5937 m +10 h +11 h +425 h +809 m +1 h +4 h +5938 m +5939 m +83 h +167 h +1016 h +4 h +1 h +4 h +5940 m +5941 m +4 h +185 h +2475 m +4 h +10 h +262 h +4 h +1 h +4 h +147 h +10 h +1650 h +5942 m +5943 m +4 h +358 h +196 h +5944 m +5945 m +13 h +1 h +195 h +109 h +124 h +73 h +1 h +1 h +1 h +4 h +109 h +83 h +4941 m +4 h +11 h +119 h +114 h +4 h +10 h +157 h +538 h +5946 m +5947 m +1 h +1 h +1 h +5948 m +4 h +5949 m +4 h +2844 m +1780 h +10 h +1 h +4 h +106 h +5950 m +10 h +4 h +5951 m +185 h +1 h +5952 m +4 h +4 h +4 h +5953 m +36 h +4805 m +172 h +4 h +330 m +238 h +4 h +4 h +4 h +626 m +5954 m +10 h +4 h +5955 m +2719 m +4 h +10 h +4 h +5956 m +5944 m +4 h +3 h +4 h +4 h +10 h +4 h +4 h +238 h +4 h +4 h +109 h +164 h +383 h +4 h +5957 m +1722 m +5958 m +5959 m +12 h +383 h +97 h +31 h +11 h +5960 m +1 h +986 h +109 h +10 h +649 h +912 m +4 h +4 h +4 h +10 h +11 h +4 h +4503 m +1 h +4 h +1 h +164 h +140 h +55 h +5961 m +1 h +11 h +4 h +757 m +82 h +10 h +1039 m +5962 m +4 h +4 h +5963 m +1 h +10 h +5964 m +13 h +10 h +1 h +11 h +1 h +1 h +10 h +10 h +5965 m +5966 m +4 h +59 h +10 h +10 h +4 h +5967 m +4 h +25 h +1595 m +4 h +278 h +4 h +1619 m +124 h +4 h +10 h +4 h +10 h +11 h +434 h +4 h +1053 h +276 h +5968 m +10 h +74 h +59 h +4 h +2022 m +4 h +10 h +1 h +5969 m 
+31 h +10 h +4 h +1 h +11 h +4 h +614 m +1 h +4 h +4 h +83 h +4 h +278 h +83 h +5970 m +4 h +13 h +124 h +41 h +4 h +4 h +10 h +1 h +59 h +4 h +4 h +640 h +1 h +124 h +4 h +1 h +10 h +5971 m +2887 h +459 m +5972 m +5973 m +4 h +5974 m +65 h +1 h +4 h +5975 m +10 h +11 h +1 h +10 h +4 h +157 h +5976 m +10 h +5977 m +5978 m +59 h +279 h +156 h +359 h +144 h +4 h +1 h +8 h +77 h +1 h +1 h +1 h +1 h +1 h +4 h +5979 m +1105 h +1 h +11 h +1 h +59 h +1 h +1 h +208 m +1 h +112 h +1 h +94 h +10 h +1 h +41 h +4 h +5980 m +5981 m +10 h +1 h +195 h +4 h +5982 m +4 h +4 h +1 h +1137 h +5983 m +10 h +5984 m +185 h +1 h +4 h +5985 m +5986 m +5987 m +5988 m +82 h +114 h +7 m +169 h +1 h +10 h +1 h +5989 m +55 h +73 h +5990 m +250 h +10 h +1197 m +10 h +36 h +383 h +119 h +4 h +167 h +4 h +10 h +55 h +4 h +124 h +83 h +65 h +4 h +10 h +4 h +4 h +4 h +4 h +8 h +4810 m +109 h +1 h +4 h +1137 h +1 h +5991 m +1 h +91 h +10 h +170 h +5992 m +5993 m +169 h +1 h +10 h +4 h +368 h +3 h +5994 m +4 h +5464 m +1 h +1 h +5995 m +1 h +56 h +104 h +5996 m +4 h +1 h +73 h +4 h +258 h +4 h +1 h +1 h +10 h +10 h +1 h +1 h +258 h +10 h +41 h +83 h +1 h +1 h +10 h +274 h +4 h +718 h +11 h +10 h +224 h +1 h +83 h +10 h +4 h +190 h +5997 m +143 h +5980 m +2148 m +1 h +4 h +5998 m +4 h +4 h +5999 m +238 h +2794 m +6000 m +219 h +1 h +79 h +368 h +10 h +1 h +4 h +22 h +4 h +11 h +6001 m +1822 h +109 h +108 h +12 h +939 h +6002 m +4 h +167 h +6003 m +4 h +250 h +2719 m +256 h +1 h +1 h +57 h +1260 m +140 h +307 h +6004 m +1 h +4 h +11 h +195 h +1 h +83 h +6005 m +5422 m +4 h +1 h +1 h +6006 m +4 h +6007 m +59 h +6008 m +4 h +4 h +6009 m +190 h +4 h +79 h +4 h +11 h +195 h +10 h +6010 m +4 h +4 h +4 h +4 h +4297 m +2475 m +1 h +4 h +4 h +11 h +31 h +4 h +1 h +1 h +109 h +195 h +2788 m +10 h +1 h +6011 m +11 h +4 h +6012 m +1 h +1 h +4 h +4 h +6013 m +4 h +1 h +4 h +6014 m +6015 m +1 h +4 h +48 h +6016 m +10 h +6017 m +4 h +6018 m +4 h +158 h +1 h +10 h +4 h +4 h +11 h +1 h +4 h +40 h +195 h +4 h +6019 m +1 
h +800 m +4 h +1 h +4 h +1 h +1 h +6020 m +6021 m +112 h +1 h +10 h +181 h +6022 m +11 h +10 h +11 h +1403 h +4 h +156 h +1 h +6023 m +204 h +1 h +4 h +10 h +6024 m +4 h +6025 m +12 h +57 h +6026 m +1 h +10 h +82 h +45 h +4 h +6027 m +109 h +10 h +4 h +44 m +10 h +1 h +2038 m +11 h +6028 m +1454 m +4 h +10 h +4 h +1123 m +4 h +4 h +1470 h +1 h +1 h +6029 m +1790 h +1 h +6030 m +10 h +109 h +443 h +1 h +1027 h +3847 m +31 h +185 h +4 h +4 h +6031 m +113 h +4 h +4 h +11 h +6032 m +435 m +1 h +6033 m +10 h +1 h +46 h +1 h +1 h +4 h +1 h +295 h +3558 m +10 h +6034 m +1 h +6035 m +4 h +1 h +307 h +1127 m +4 h +1 h +1 h +1 h +4 h +45 h +4 h +83 h +278 h +383 h +6036 m +4 h +6037 m +6038 m +6039 m +10 h +10 h +6040 m +1 h +1 h +279 h +4 h +1 h +156 h +4 h +4 h +11 h +1 h +6041 m +3214 m +97 h +119 h +1105 h +4 h +4 h +1 h +4 h +6042 m +4 h +10 h +4 h +59 h +4 h +147 h +22 h +13 h +10 h +4 h +4 h +10 h +6043 m +119 h +4 h +1 h +6044 m +381 m +4 h +10 h +1 h +1 h +6045 m +4 h +6046 m +109 h +10 h +4 h +289 h +6047 m +11 h +4 h +6048 m +82 h +4 h +1 h +6049 m +40 h +6050 m +307 h +266 h +6051 m +10 h +1 h +10 h +6052 m +6053 m +169 h +1780 h +2520 m +694 m +4 h +6054 m +6055 m +10 h +1 h +4 h +278 h +4 h +10 h +13 h +143 h +2041 m +6056 m +31 h +4 h +4498 m +55 h +1403 h +10 h +1 h +2484 m +6057 m +6058 m +4 h +10 h +104 h +1 h +1 h +1 h +4 h +1 h +10 h +2840 m +6059 m +6060 m +11 h +4 h +1 h +2887 h +6061 m +1 h +124 h +10 h +4 h +4 h +4 h +10 h +1677 m +4 h +4 h +10 h +10 h +6062 m +10 h +6063 m +6064 m +10 h +4 h +6065 m +1 h +6066 m +802 m +6067 m +4 h +104 h +6068 m +6069 m +6070 m +1105 h +56 h +6071 m +6072 m +6073 m +4 h +371 h +6074 m +4 h +6075 m +4 h +4 h +6076 m +1 h +10 h +6077 m +6078 m +6079 m +1 h +6080 m +1261 h +6081 m +64 h +4 h +6082 m +12 h +4 h +4 h +109 h +75 m +224 h +1 h +10 h +12 h +4 h +278 h +82 h +278 h +4 h +108 h +4 h +6083 m +59 h +332 h +6084 m +4 h +82 h +10 h +11 h +4 h +10 h +6085 m +857 m +4 h +6086 m +4 h +4 h +6087 m +82 h +4 h +1 h +1 
h +4 h +4596 m +265 h +4 h +184 h +75 m +1 h +4 h +10 h +11 h +1 h +6088 m +6089 m +10 h +10 h +4 h +4 h +6090 m +10 h +104 h +41 h +6091 m +10 h +13 h +57 h +4 h +4 h +10 h +258 h +82 h +6092 m +4 h +4 h +6093 m +4 h +1359 h +1 h +4 h +6094 m +4 h +6095 m +1 h +1619 m +57 h +1 h +6096 m +4 h +6097 m +1024 m +4 h +569 h +4 h +1 h +10 h +6098 m +4 h +6099 m +757 m +36 h +4 h +258 h +4 h +1 h +386 h +570 h +110 h +4 h +1 h +4 h +1 h +170 h +124 h +1 h +4 h +6100 m +79 h +10 h +169 h +4 h +6101 m +8 h +57 h +4 h +1 h +4 h +1766 h +6102 m +12 h +6103 m +6104 m +10 h +4 h +1 h +114 h +6105 m +5809 m +6106 m +4 h +4 h +1250 h +4 h +97 h +4 h +4 h +4 h +4 h +124 h +11 h +57 h +174 h +10 h +1 h +601 h +12 h +1 h +4 h +10 h +6107 m +135 h +10 h +6108 m +258 h +6109 m +10 h +976 h +41 h +250 h +4 h +41 h +10 h +4 h +4 h +1 h +10 h +6110 m +73 h +6111 m +4 h +10 h +6112 m +83 h +4 h +109 h +1 h +1 h +4 h +1 h +278 h +82 h +10 h +4 h +6113 m +109 h +6114 m +4 h +1 h +1 h +25 h +6115 m +238 h +1 h +10 h +10 h +6116 m +986 h +25 h +4 h +11 h +1 h +116 m +109 h +1 h +1 h +1 h +22 h +4 h +64 h +6117 m +1 h +4 h +6118 m +10 h +25 h +10 h +4 h +10 h +11 h +4 h +10 h +124 h +1952 m +4 h +41 h +3115 m +1 h +6119 m +430 m +4 h +272 m +4 h +6120 m +6121 m +1 h +79 h +4 h +147 h +10 h +1 h +6122 m +1030 h +6123 m +31 h +1 h +1 h +10 h +6124 m +11 h +1 h +10 h +143 h +146 h +41 h +1 h +10 h +4 h +4 h +1 h +6125 m +4 h +238 h +11 h +4 h +124 h +4 h +4 h +1 h +10 h +1 h +4 h +170 h +6126 m +1 h +6127 m +1 h +4 h +10 h +4 h +77 h +4 h +8 h +4 h +6128 m +4 h +10 h +10 h +4 h +307 h +23 h +172 h +448 m +2158 m +146 h +1389 h +10 h +10 h +6129 m +4 h +125 h +6130 m +4 h +1 h +700 m +11 h +4 h +2591 m +10 h +6131 m +6132 m +4 h +1 h +10 h +6133 m +4 h +10 h +4 h +10 h +57 h +1 h +4 h +6134 m +57 h +10 h +6135 m +6136 m +1 h +3 h +10 h +94 h +266 h +4 h +4 h +10 h +1 h +1 h +11 h +6137 m +6138 m +1 h +10 h +10 h +10 h +4 h +1 h +1 h +4 h +4 h +10 h +6139 m +6140 m +146 h +6141 m +1137 h +10 h +4 
h +1089 h +4 h +4 h +1 h +6142 m +1 h +6143 m +1822 h +10 h +203 m +435 m +4 h +6144 m +11 h +112 h +4 h +124 h +6145 m +1 h +986 h +10 h +1 h +10 h +6146 m +1 h +238 h +31 h +146 h +10 h +10 h +31 h +4 h +41 h +10 h +65 h +1 h +6147 m +73 h +4 h +6148 m +109 h +82 h +109 h +6149 m +41 h +4 h +4 h +1 h +10 h +6150 m +1 h +10 h +11 h +6151 m +4 h +6152 m +109 h +4 h +6153 m +185 h +1 h +25 h +4 h +4 h +10 h +1 h +108 h +104 h +278 h +6154 m +6155 m +1 h +1 h +4 h +6156 m +1403 h +6157 m +3 h +1 h +10 h +6132 m +1 h +1 h +169 h +4 h +4 h +297 h +31 h +1 h +1 h +4 h +614 m +6158 m +295 h +41 h +1 h +124 h +12 h +2887 h +4 h +10 h +1 h +6159 m +4 h +6160 m +6161 m +4359 m +1 h +6162 m +10 h +6163 m +332 h +10 h +10 h +4 h +6164 m +4 h +4 h +6165 m +109 h +4 h +10 h +10 h +195 h +6166 m +4 h +1 h +538 h +1 h +4 h +4 h +1089 h +10 h +10 h +6167 m +1 h +3 h +4 h +1128 m +1 h +4 h +1 h +12 h +79 h +1 h +6168 m +94 h +10 h +4520 m +6169 m +6170 m +1 h +1357 m +1100 m +4 h +6171 m +4 h +6172 m +11 h +196 h +1 h +11 h +4 h +10 h +4 h +359 h +6173 m +6174 m +6175 m +332 h +1 h +1 h +4 h +10 h +6176 m +1 h +55 h +1 h +6177 m +6178 m +1 h +82 h +10 h +6179 m +6180 m +1 h +1 h +1 h +371 h +4 h +4 h +6181 m +8 h +10 h +6182 m +1 h +4 h +6183 m +4 h +6184 m +4 h +6185 m +4 h +57 h +4 h +2374 m +6186 m +1 h +10 h +6187 m +3321 m +10 h +10 h +6188 m +1 h +1 h +6189 m +368 h +10 h +272 m +6190 m +97 h +307 h +6191 m +10 h +4 h +4 h +10 h +25 h +11 h +4 h +4 h +4 h +403 h +4 h +6192 m +10 h +538 h +1 h +6193 m +1 h +4 h +10 h +10 h +181 h +82 h +6194 m +4 h +4 h +338 h +112 h +31 h +4 h +10 h +6195 m +1 h +1 h +104 h +4 h +359 h +1 h +4 h +11 h +1 h +6196 m +10 h +266 h +3 h +57 h +1 h +1 h +6197 m +4 h +4 h +135 h +10 h +4 h +4 h +41 h +31 h +6198 m +6199 m +1 h +4 h +1 h +6200 m +10 h +1 h +10 h +1524 m +6201 m +935 h +6202 m +79 h +139 h +10 h +10 h +186 h +1 h +4 h +92 h +1 h +1 h +97 h +1218 m +6203 m +1 h +1 h +59 h +4 h +4 h +10 h +1 h +4 h +1 h +48 h +57 h +138 h +4 h +83 h +1 
h +6204 m +6205 m +4 h +6206 m +575 h +11 h +1 h +4 h +4 h +6207 m +270 h +4 h +11 h +6208 m +4 h +10 h +4 h +11 h +4 h +83 h +109 h +258 h +4 h +6209 m +1 h +41 h +65 h +6210 m +146 h +4 h +11 h +31 h +1 h +6211 m +4 h +6212 m +1 h +1 h +820 h +10 h +388 m +4 h +4 h +4 h +276 h +1070 m +6213 m +4 h +250 h +1 h +1 h +4 h +10 h +124 h +155 m +6214 m +1 h +4 h +36 h +6215 m +6216 m +4 h +146 h +6217 m +6218 m +10 h +10 h +569 h +1 h +185 h +6219 m +10 h +1 h +104 h +59 h +6220 m +1016 h +6221 m +489 m +65 h +1 h +125 h +4 h +10 h +4 h +4 h +82 h +77 h +4 h +1 h +192 h +10 h +266 h +6222 m +124 h +1 h +1772 h +4 h +11 h +4 h +6223 m +4 h +36 h +1 h +59 h +1 h +6224 m +1 h +1 h +3 h +172 h +6225 m +4 h +4 h +2374 m +124 h +10 h +224 h +60 m +4 h +10 h +25 h +195 h +196 h +677 m +1 h +1 h +6226 m +6227 m +1642 h +1642 h +1 h +109 h +6228 m +6229 m +1 h +11 h +4 h +6230 m +12 h +6231 m +6232 m +146 h +1 h +10 h +4 h +1 h +1 h +10 h +25 h +10 h +1337 m +10 h +56 h +1 h +1 h +1 h +3 h +5348 m +11 h +6233 m +3 h +123 h +11 h +10 h +478 m +403 h +1 h +6234 m +6235 m +4 h +4 h +6236 m +4 h +11 h +4 h +10 h +57 h +6237 m +12 h +6238 m +4 h +4 h +1 h +6239 m +4 h +295 h +196 h +4 h +74 h +1 h +4 h +1 h +10 h +56 h +1 h +138 h +4 h +4 h +976 h +10 h +6240 m +6241 m +6242 m +1 h +1 h +10 h +1 h +59 h +4 h +45 h +1 h +368 h +4 h +10 h +25 h +4 h +6243 m +172 h +79 h +6244 m +649 h +10 h +1 h +204 h +4 h +928 h +10 h +83 h +45 h +6245 m +1 h +4 h +94 h +4 h +10 h +3 h +45 h +83 h +59 h +4 h +6246 m +3341 m +10 h +6247 m +4 h +124 h +31 h +6248 m +307 h +1 h +6249 m +6250 m +4 h +4 h +6251 m +1 h +4 h +4 h +6252 m +6253 m +10 h +113 h +238 h +6254 m +186 h +74 h +74 h +4 h +6255 m +1 h +82 h +155 m +2733 h +10 h +6256 m +6257 m +4 h +1 h +1 h +10 h +22 h +146 h +1 h +1893 m +124 h +4 h +1 h +1 h +10 h +104 h +6258 m +10 h +4 h +464 h +1619 h +6259 m +316 m +10 h +1 h +4 h +55 h +6260 m +6261 m +1 h +1 h +6262 m +4 h +6263 m +4 h +4 h +4 h +4 h +2069 m +4 h +11 h +1 h +4 h +1 h +6264 
m +92 h +4 h +10 h +258 h +57 h +4 h +10 h +4 h +4 h +4 h +124 h +4 h +109 h +79 h +46 h +6265 m +4 h +368 h +614 m +6266 m +4 h +6267 m +4 h +575 h +1 h +1 h +3 h +11 h +56 h +1 h +31 h +6268 m +10 h +6269 m +10 h +1772 h +6270 m +4 h +22 h +1 h +1105 h +4576 m +6271 m +1 h +1 h +123 h +73 h +1 h +1 h +65 h +6272 m +4 h +1 h +351 m +4 h +6273 m +4 h +4 h +1453 m +6274 m +6275 m +4 h +4 h +190 h +358 h +28 h +94 h +1 h +1 h +48 h +4 h +22 h +6276 m +23 h +10 h +4 h +214 m +73 h +6277 m +4 h +1 h +1 h +10 h +4 h +4111 m +6278 m +10 h +4 h +6279 m +1 h +4 h +1 h +1 h +6280 m +10 h +4538 m +1 h +6281 m +1 h +135 h +6282 m +11 h +266 h +10 h +4 h +10 h +6283 m +4 h +11 h +10 h +147 h +935 h +4 h +4 h +1 h +1 h +4 h +4 h +6284 m +1 h +1 h +1 h +316 m +4 h +11 h +6285 m +3 h +4 h +4 h +1 h +6286 m +1359 h +6287 m +83 h +181 h +4 h +4 h +10 h +4 h +10 h +4 h +1 h +6288 m +4 h +6289 m +2308 m +2769 m +1 h +10 h +31 h +59 h +1 h +8 h +265 h +6290 m +10 h +1 h +6291 m +146 h +1 h +97 h +1 h +1 h +6292 m +97 h +1250 h +57 h +1 h +135 h +113 h +6293 m +4 h +119 h +1 h +124 h +158 h +6294 m +4 h +11 h +10 h +4 h +2719 h +1 h +196 h +4 h +10 h +1074 m +6295 m +6296 m +4 h +4 h +190 h +1 h +1 h +3 h +4626 m +41 h +172 h +1 h +1 h +82 h +6297 m +1 h +73 h +1 h +6298 m +1 h +4 h +4 h +108 h +1 h +25 h +10 h +184 h +4 h +167 h +11 h +10 h +109 h +4 h +488 m +1 h +97 h +57 h +238 h +6299 m +939 h +4 h +1 h +10 h +11 h +4 h +1 h +4 h +1 h +6300 m +1 h +1 h +10 h +3 h +6301 m +74 h +6302 m +1 h +1 h +10 h +6303 m +6304 m +1454 m +3 h +6305 m +488 h +4 h +6306 m +400 m +4 h +114 h +73 h +73 h +83 h +4 h +6307 m +1 h +1 h +1 h +229 h +4 h +196 h +57 h +4 h +4 h +1 h +6308 m +1 h +1 h +3 h +536 h +6309 m +626 m +10 h +10 h +196 h +1 h +4 h +57 h +4 h +1 h +41 h +11 h +6310 m +4 h +1 h +11 h +59 h +4 h +601 h +1 h +4 h +6311 m +10 h +6312 m +146 h +1 h +6313 m +1 h +119 h +82 h +6314 m +10 h +4 h +4 h +1 h +6315 m +1 h +1 h +167 h +6316 m +146 h +3558 m +25 h +3 h +124 h +10 h +4 h +1 h 
+3111 m +563 m +4 h +6317 m +4 h +10 h +591 m +91 h +12 h +1 h +31 h +4 h +1 h +4 h +4 h +4 h +55 h +1 h +4 h +41 h +11 h +109 h +1 h +1 h +4 h +6318 m +1 h +4 h +10 h +4 h +77 h +4 h +4 h +4 h +1 h +6319 m +1 h +4 h +4 h +3834 m +5765 m +57 h +4 h +157 h +399 h +91 h +10 h +169 h +10 h +57 h +124 h +10 h +4 h +4 h +73 h +172 h +124 h +93 h +104 h +2172 m +6320 m +1074 m +1 h +25 h +109 h +1 h +10 h +6321 m +897 m +1 h +6322 m +1096 m +4 h +322 m +40 h +82 h +6323 m +6324 m +4 h +4 h +4 h +135 h +56 h +4 h +4 h +536 h +10 h +1 h +6325 m +125 h +1 h +6326 m +1 h +124 h +1 h +11 h +1 h +4 h +64 h +6327 m +109 h +1 h +11 h +10 h +10 h +6328 m +4 h +4 h +1 h +1 h +4 h +4 h +25 h +140 h +4 h +1 h +10 h +109 h +109 h +82 h +6329 m +10 h +10 h +6330 m +1089 h +1 h +6331 m +4 h +6332 m +4151 m +6333 m +1 h +6334 m +4 h +56 h +40 h +10 h +10 h +4 h +185 h +1 h +104 h +10 h +4 h +1 h +1548 m +109 h +332 h +83 h +4 h +368 h +59 h +6335 m +1 h +104 h +1 h +4 h +278 h +5917 m +6336 m +12 h +10 h +119 h +1 h +10 h +10 h +6337 m +135 h +4 h +70 m +4 h +10 h +412 m +1 h +6338 m +10 h +6339 m +6340 m +4 h +6341 m +4 h +82 h +6342 m +3 h +10 h +146 h +6343 m +4 h +575 h +10 h +6344 m +1655 m +57 h +195 h +4 h +4 h +1 h +64 h +601 h +6345 m +1 h +97 h +1 h +1 h +55 h +1 h +10 h +6346 m +31 h +6347 m +8 h +1 h +11 h +10 h +10 h +94 h +156 h +6348 m +459 m +4 h +4 h +1677 m +6349 m +11 h +1 h +1 h +1 h +6350 m +4 h +6351 m +11 h +57 h +4 h +4 h +1 h +41 h +4 h +1 h +1 h +10 h +1 h +10 h +6352 m +1 h +10 h +57 h +10 h +4 h +1 h +10 h +10 h +13 h +55 h +3562 m +4 h +1 h +10 h +6353 m +6354 m +91 h +3 h +258 h +10 h +4 h +59 h +10 h +10 h +4 h +124 h +147 h +368 h +1 h +10 h +6355 m +6356 m +4 h +25 h +1 h +4 h +4 h +4 h +3068 m +1 h +4 h +4 h +4 h +4 h +4 h +6357 m +104 h +718 h +1 h +10 h +3 h +4 h +4 h +1 h +6358 m +4 h +6359 m +10 h +6360 m +113 h +1 h +6361 m +6362 m +25 h +1 h +1 h +1 h +10 h +4 h +10 h +6363 m +575 h +4 h +4 h +48 h +6364 m +1 h +3 h +10 h +6365 m +1 h +124 h +59 h 
+692 h +4 h +443 h +6366 m +4 h +91 h +6367 m +41 h +195 h +2261 m +4 h +4 h +1 h +10 h +6368 m +1 h +4 h +4 h +4 h +104 h +83 h +6369 m +10 h +3779 m +4 h +1362 h +1 h +82 h +4 h +3 h +6370 m +91 h +6371 m +1822 h +6372 m +125 h +6373 m +4 h +1 h +25 h +6374 m +238 h +123 h +57 h +2887 h +6375 m +104 h +10 h +10 h +10 h +1 h +4 h +1 h +2379 m +82 h +10 h +1 h +6376 m +603 m +31 h +4 h +4 h +4 h +3025 m +3141 m +3 h +10 h +10 h +157 h +6377 m +6378 m +1 h +13 h +27 h +1 h +41 h +10 h +6379 m +6380 m +1 h +270 h +11 h +4 h +4 h +4 h +6381 m +73 h +10 h +10 h +10 h +6382 m +10 h +307 h +4 h +6383 m +10 h +1 h +6384 m +65 h +918 m +4 h +383 h +125 h +10 h +6385 m +808 m +25 h +4 h +4 h +4 h +939 h +6386 m +83 h +184 h +4 h +10 h +1296 m +1 h +1261 h +1 h +10 h +97 h +10 h +6387 m +687 h +4 h +135 h +4 h +5475 m +65 h +6388 m +1185 m +4 h +6389 m +10 h +1 h +538 h +1 h +5070 m +10 h +4 h +4 h +1 h +4 h +6390 m +1 h +4 h +6391 m +10 h +4 h +57 h +6392 m +10 h +10 h +2146 m +1 h +65 h +371 h +10 h +195 h +1 h +59 h +4 h +3558 m +170 h +4 h +6393 m +4 h +4 h +31 h +1 h +6394 m +22 h +1 h +4 h +4 h +4 h +109 h +10 h +4 h +41 h +4 h +3 h +4 h +297 h +4 h +990 m +25 h +4 h +4 h +4 h +359 h +83 h +4 h +45 h +41 h +1 h +10 h +59 h +1 h +4 h +11 h +6395 m +6396 m +4 h +147 h +4 h +6397 m +4 h +104 h +4 h +83 h +4 h +3383 m +6398 m +10 h +124 h +1370 m +4 h +276 h +97 h +1 h +10 h +238 h +4 h +6399 m +4 h +4 h +6400 m +687 h +4 h +1 h +4 h +82 h +55 h +31 h +6401 m +10 h +4 h +6402 m +64 h +65 h +6403 m +6404 m +4 h +11 h +6405 m +6406 m +6407 m +10 h +4 h +4 h +4 h +10 h +1 h +6408 m +1 h +1 h +4 h +1 h +4 h +6409 m +6410 m +10 h +10 h +1 h +1 h +31 h +82 h +3278 m +486 m +6411 m +4 h +10 h +4 h +11 h +55 h +1 h +10 h +295 h +6412 m +6413 m +4 h +443 h +4 h +1 h +4 h +6414 m +276 h +10 h +31 h +190 h +1 h +4 h +1 h +6415 m +10 h +4 h +1 h +640 h +4 h +4 h +57 h +1 h +6416 m +4 h +185 h +6417 m +4 h +82 h +1 h +2288 m +1 h +82 h +6418 m +4 h +125 h +6419 m +4 h +57 h +125 h +4 h 
+443 h +146 h +5 m +4 h +12 h +10 h +6420 m +6421 m +1027 h +4 h +6422 m +10 h +4 h +1 h +31 h +4 h +1 h +1 h +6423 m +10 h +4 h +4 h +112 h +6424 m +1 h +6425 m +11 h +1 h +656 m +10 h +1 h +230 h +4 h +6426 m +6427 m +1 h +125 h +10 h +65 h +56 h +1 h +10 h +1 h +109 h +6428 m +1 h +10 h +59 h +1 h +1 h +6429 m +4 h +1 h +13 h +4 h +91 h +119 h +10 h +41 h +41 h +13 h +1685 h +6430 m +2339 m +1 h +4 h +10 h +6431 m +386 h +1 h +79 h +135 h +59 h +6432 m +3435 m +6433 m +6434 m +1 h +4 h +6435 m +6436 m +6437 m +6438 m +1 h +6439 m +6440 m +57 h +4 h +1 h +1 h +57 h +57 h +4 h +332 h +6441 m +4 h +64 h +119 h +493 m +6442 m +6443 m +4 h +82 h +1 h +4 h +12 h +1 h +10 h +156 h +805 m +569 h +4 h +104 h +10 h +6444 m +6445 m +4 h +1 h +10 h +6446 m +10 h +11 h +10 h +4 h +87 m +6447 m +6448 m +258 h +4 h +11 h +330 h +4 h +10 h +1 h +82 h +4 h +11 h +1 h +10 h +4 h +4 h +4 h +4 h +4 h +4 h +6449 m +4 h +6450 m +6451 m +6452 m +6399 m +6453 m +11 h +109 h +4 h +10 h +1685 h +4 h +1 h +4 h +1 h +4 h +4 h +4 h +6454 m +6455 m +10 h +1 h +4 h +41 h +4 h +6456 m +1 h +10 h +488 h +10 h +4 h +10 h +73 h +4 h +4914 m +1 h +10 h +11 h +6457 m +737 m +11 h +69 m +4 h +6458 m +1 h +12 h +6459 m +12 h +1 h +4 h +1 h +4 h +6460 m +1 h +10 h +1 h +6461 m +976 h +146 h +10 h +57 h +10 h +11 h +4 h +170 h +1 h +184 h +6462 m +10 h +4 h +4 h +4 h +10 h +114 h +10 h +477 m +4 h +4 h +11 h +6463 m +74 h +64 h +322 h +3561 m +1 h +27 h +403 h +93 h +10 h +83 h +4 h +4 h +6464 m +11 h +82 h +4 h +6465 m +6466 m +4 h +4229 m +1 h +6467 m +10 h +124 h +55 h +224 h +10 h +79 h +6468 m +1 h +36 h +1 h +6469 m +1 h +10 h +4 h +10 h +124 h +6470 m +1 h +6471 m +4 h +1 h +4 h +6472 m +368 h +297 h +4 h +4 h +150 m +1 h +1541 m +6473 m +6474 m +4 h +4 h +1 h +229 h +6475 m +114 h +1886 m +4 h +10 h +4 h +31 h +10 h +82 h +6476 m +6477 m +94 h +4858 m +1 h +11 h +1 h +13 h +6105 m +6478 m +6479 m +158 h +4 h +10 h +4 h +250 h +1 h +4 h +1 h +4 h +4 h +4 h +1 h +358 h +4 h +4 h +82 h +83 h +41 h 
+83 h +4 h +4 h +25 h +124 h +138 h +4 h +448 m +1 h +575 h +1 h +6480 m +1 h +104 h +10 h +1 h +4 h +55 h +6481 m +6482 m +6483 m +10 h +25 h +4 h +307 h +6484 m +4 h +1761 m +6485 m +6486 m +1 h +57 h +443 h +10 h +4 h +172 h +10 h +10 h +143 h +10 h +10 h +4 h +4 h +6487 m +10 h +124 h +1 h +6488 m +4 h +1 h +258 h +10 h +10 h +6489 m +1 h +6490 m +3 h +17 m +97 h +4 h +6491 m +1 h +1 h +1 h +258 h +4 h +1 h +857 m +55 h +6492 m +4 h +124 h +1 h +93 h +104 h +6493 m +6494 m +229 h +5478 m +4 h +31 h +104 h +10 h +156 h +1 h +10 h +1 h +181 h +83 h +4 h +4 h +3 h +6495 m +41 h +146 h +601 h +6496 m +1 h +4 h +6497 m +6498 m +6499 m +6500 m +6501 m +82 h +10 h +4 h +4 h +1 h +6502 m +10 h +11 h +10 h +6503 m +181 h +4 h +4 h +109 h +4 h +41 h +6504 m +4 h +4 h +94 h +59 h +6505 m +1619 h +167 h +228 m +1 h +6506 m +1 h +4 h +6507 m +4 h +6508 m +4 h +1751 m +4 h +6509 m +1 h +6423 m +4 h +4 h +1260 m +11 h +6510 m +10 h +4 h +6511 m +124 h +4 h +10 h +10 h +4 h +6512 m +1 h +3170 m +4 h +12 h +1 h +112 h +4 h +41 h +1 h +6513 m +412 m +73 h +6514 m +1 h +1 h +1 h +6515 m +10 h +10 h +1 h +92 h +10 h +4 h +6516 m +125 h +11 h +6517 m +10 h +4 h +4 h +114 h +82 h +4 h +4 h +4 h +10 h +4 h +3396 m +10 h +1 h +2163 m +6518 m +1 h +4 h +10 h +4 h +6519 m +10 h +1 h +4 h +4 h +1 h +1 h +4 h +3680 m +1 h +4 h +6520 m +114 h +4 h +65 h +10 h +82 h +10 h +82 h +1 h +1 h +6521 m +4 h +59 h +536 h +10 h +6522 m +737 m +1 h +230 h +4 h +1 h +4 h +6523 m +10 h +10 h +1089 h +109 h +6524 m +6525 m +10 h +1548 m +802 m +1 h +4 h +4 h +36 h +4 h +10 h +1 h +4 h +6526 m +146 h +10 h +6527 m +124 h +1 h +4 h +1 h +299 h +1 h +6528 m +4 h +1 h +10 h +82 h +83 h +31 h +11 h +6529 m +1 h +386 h +6530 m +1 h +1322 m +10 h +4 h +6531 m +83 h +6532 m +6533 m +359 h +1 h +4 h +82 h +10 h +6534 m +41 h +10 h +4 h +10 h +4 h +1 h +83 h +2442 m +59 h +4 h +6535 m +6536 m +914 m +82 h +1 h +6537 m +1 h +4 h +109 h +1 h +6538 m +59 h +4 h +4 h +1 h +146 h +1788 m +6539 m +6540 m +10 h +156 h 
+1 h +1 h +77 h +10 h +22 h +1 h +4 h +4 h +6541 m +1 h +1 h +6542 m +1 h +6543 m +10 h +69 h +6544 m +147 h +1027 h +4 h +1 h +92 h +4 h +6545 m +4 h +6546 m +1 h +10 h +6547 m +195 h +1 h +4 h +6548 m +297 h +1 h +59 h +124 h +4 h +10 h +10 h +4 h +1 h +10 h +6549 m +1 h +11 h +6550 m +4 h +4 h +59 h +6551 m +829 m +59 h +1 h +6552 m +185 h +10 h +10 h +757 h +1 h +6553 m +4 h +3 h +4 h +640 h +59 h +73 h +4 h +83 h +1 h +10 h +147 h +6554 m +12 h +1 h +1 h +1 h +143 h +4 h +4 h +4 h +10 h +358 h +6555 m +4 h +1 h +6556 m +1 h +125 h +147 h +6557 m +1 h +447 h +124 h +1 h +6558 m +10 h +10 h +6559 m +6560 m +92 h +6561 m +5944 h +170 h +4 h +4 h +629 m +1 h +1 h +10 h +74 h +4 h +104 h +104 h +10 h +6562 m +57 h +4 h +28 h +6563 m +265 h +4 h +6564 m +41 h +6565 m +4 h +124 h +265 h +184 h +125 h +1 h +6566 m +6567 m +1 h +82 h +1 h +4 h +4 h +129 h +1137 h +4 h +5581 m +6568 m +1 h +4 h +6569 m +6570 m +56 h +1 h +6571 m +4 h +4 h +10 h +10 h +13 h +4 h +129 h +6572 m +109 h +4 h +1201 h +10 h +4 h +195 h +173 h +10 h +10 h +1137 h +164 h +27 h +125 h +10 h +1 h +5125 m +1 h +10 h +4 h +1 h +4 h +6573 m +4 h +4 h +185 h +1 h +1 h +1650 h +1137 h +83 h +6574 m +118 h +1 h +4 h +278 h +6575 m +124 h +1 h +45 h +4 h +1 h +1 h +10 h +6576 m +4 h +25 h +4 h +1137 h +4 h +1 h +82 h +656 m +6577 m +10 h +6578 m +1 h +12 h +2116 m +109 h +4 h +3 h +4 h +4 h +4 h +6579 m +6580 m +4 h +10 h +4 h +6581 m +169 h +6582 m +1632 m +276 h +1 h +1 h +192 h +3 h +6583 m +1 h +1 h +2184 m +1 h +170 h +6584 m +4 h +6585 m +1 h +4 h +1 h +4 h +31 h +2891 m +41 h +94 h +97 h +4 h +4 h +4 h +10 h +124 h +6586 m +4 h +4 h +1 h +4 h +10 h +4 h +57 h +692 h +4 h +10 h +10 h +6587 m +45 h +4 h +358 h +10 h +124 h +4 h +6588 m +1 h +4 h +74 h +6589 m +6590 m +6591 m +11 h +129 h +4 h +1250 h +4 h +82 h +6592 m +229 h +4 h +6593 m +4 h +1 h +10 h +6594 m +25 h +10 h +4 h +10 h +104 h +1 h +6595 m +10 h +6596 m +6597 m +10 h +4 h +10 h +368 h +109 h +125 h +1 h +6598 m +4 h +6599 m +4 h +1 h 
+6600 m +4 h +4 h +170 h +4 h +4 h +129 h +6601 m +57 h +10 h +6602 m +10 h +6603 m +10 h +10 h +41 h +1 h +10 h +83 h +4 h +1 h +1 h +4 h +4 h +4 h +12 h +10 h +11 h +1595 m +10 h +11 h +6604 m +6605 m +6606 m +488 h +109 h +10 h +6607 m +1122 m +4 h +94 h +31 h +4 h +1 h +10 h +4 h +6608 m +170 h +6609 m +10 h +10 h +4 h +4 h +1 h +1 h +6610 m +4 h +4 h +170 h +6611 m +1 h +6612 m +4 h +82 h +109 h +4 h +10 h +8 h +6613 m +307 h +1 h +4 h +11 h +4 h +3 h +229 h +4 h +4 h +6614 m +10 h +192 h +6615 m +10 h +4 h +4 h +124 h +1 h +6616 m +6617 m +1 h +1 h +104 h +1 h +13 h +119 h +468 m +1 h +114 h +73 h +1 h +4 h +74 h +6618 m +601 h +4 h +25 h +10 h +4 h +1 h +4 h +270 h +41 h +6619 m +6620 m +4 h +477 m +1 h +4 h +6621 m +196 h +4 h +10 h +1 h +1403 h +4 h +4 h +4 h +4 h +6622 m +10 h +10 h +11 h +83 h +2303 m +1 h +10 h +11 h +1 h +25 h +11 h +2268 m +10 h +83 h +1 h +1 h +6623 m +6624 m +6625 m +1 h +4 h +12 h +1 h +31 h +10 h +6626 m +73 h +181 h +1 h +4 h +6627 m +6628 m +10 h +6629 m +4 h +4 h +4 h +184 h +82 h +10 h +262 h +6630 m +10 h +4 h +10 h +4 h +10 h +184 h +45 h +443 h +1 h +4 h +6631 m +5341 m +4932 m +10 h +6200 m +12 h +59 h +6632 m +170 h +6633 m +6634 m +167 h +4 h +4 h +1 h +91 h +857 m +12 h +4 h +6635 m +6636 m +13 h +10 h +4 h +94 h +1 h +31 h +55 h +1 h +6637 m +601 h +5590 m +1 h +104 h +1 h +6638 m +4 h +358 h +11 h +6639 m +11 h +1 h +4 h +4 h +56 h +11 h +12 h +6640 m +4 h +10 h +6641 m +6642 m +238 h +10 h +6643 m +4 h +403 h +6644 m +6645 m +1 h +4 h +4 h +6646 m +1 h +1576 m +25 h +1 h +4 h +1 h +4 h +6647 m +4 h +31 h +10 h +82 h +6648 m +1 h +2591 m +79 h +965 m +1 h +1 h +10 h +1 h +25 h +10 h +1304 m +64 h +11 h +10 h +83 h +4 h +167 h +4 h +4 h +1 h +6649 m +570 h +784 m +4 h +146 h +10 h +4 h +1 h +57 h +29 m +4 h +1 h +10 h +1 h +4 h +297 h +4 h +109 h +109 h +6650 m +1 h +184 h +266 h +11 h +443 h +1 h +6651 m +10 h +1769 m +11 h +123 h +603 m +10 h +1 h +4 h +4 h +1 h +250 h +1 h +1 h +6652 m +109 h +4 h +4 h +10 h +4 h 
+10 h +64 h +10 h +82 h +125 h +1218 m +146 h +575 h +6653 m +135 h +6654 m +4 h +4 h +82 h +124 h +10 h +79 h +4 h +1504 m +1 h +4 h +6655 m +4 h +6656 m +1 h +10 h +196 h +65 h +10 h +10 h +186 h +114 h +1 h +4 h +1321 m +10 h +1 h +1 h +36 h +6657 m +10 h +1 h +4 h +4 h +1 h +1 h +6658 m +1 h +1 h +135 h +57 h +13 h +1 h +4 h +6659 m +4 h +649 h +4 h +4 h +4 h +79 h +11 h +83 h +6660 m +10 h +4 h +6661 m +10 h +4 h +1 h +6031 m +4 h +733 m +6662 m +6663 m +3 h +10 h +4 h +1003 m +4 h +4 h +6664 m +4 h +1 h +6665 m +5505 m +192 h +4 h +83 h +10 h +5762 m +59 h +4 h +13 h +13 h +4 h +1 h +1 h +1 h +6666 m +6667 m +41 h +6668 m +4 h +1 h +6669 m +1 h +4 h +4 h +1 h +2418 h +4 h +6670 m +10 h +6671 m +1 h +1322 m +1 h +1 h +4 h +4 h +1646 m +195 h +3188 m +113 h +4 h +8 h +4 h +1 h +4 h +6672 m +10 h +10 h +6673 m +6674 m +10 h +6675 m +4 h +124 h +110 h +10 h +83 h +1 h +147 h +1 h +139 h +6676 m +1 h +12 h +4 h +135 h +40 h +4 h +4 h +4723 m +83 h +10 h +4 h +6677 m +6678 m +1 h +195 h +4 h +10 h +10 h +4 h +4 h +10 h +6679 m +10 h +1 h +10 h +1576 m +97 h +1650 h +4 h +6680 m +1 h +167 h +1 h +4 h +1948 m +4 h +10 h +6681 m +10 h +119 h +172 h +1 h +4 h +4 h +4 h +57 h +575 h +6682 m +124 h +11 h +1 h +2256 m +4359 m +6683 m +3680 m +4 h +4 h +55 h +82 h +1 h +10 h +11 h +1 h +135 h +4 h +8 h +4 h +109 h +82 h +25 h +125 h +4 h +10 h +4 h +1 h +11 h +4 h +79 h +10 h +4 h +601 h +74 h +4 h +10 h +3837 m +10 h +4 h +1 h +6684 m +59 h +6685 m +6686 m +6687 m +83 h +4 h +10 h +6688 m +1 h +820 h +4 h +6689 m +6690 m +4 h +104 h +109 h +6691 m +6692 m +601 h +1 h +10 h +10 h +195 h +8 h +10 h +911 h +4520 m +443 h +6693 m +4 h +4 h +1 h +147 h +3177 m +10 h +6694 m +82 h +6695 m +6696 m +55 h +12 h +1 h +6697 m +6698 m +4966 m +10 h +1 h +6699 m +124 h +6700 m +6701 m +4 h +74 h +4 h +4 h +6702 m +1 h +4 h +4 h +124 h +146 h +297 h +1 h +184 h +1 h +6703 m +12 h +4 h +123 h +10 h +1 h +77 h +27 h +1 h +10 h +1 h +1 h +6704 m +92 h +4 h +1 h +6705 m +1 h +4 h +6706 m 
+6707 m +1650 h +10 h +3 h +1 h +10 h +1 h +118 h +10 h +1 h +6708 m +1 h +4 h +97 h +10 h +1 h +6709 m +801 m +169 h +10 h +41 h +146 h +4 h +4 h +4 h +1 h +6710 m +4 h +4 h +1 h +4 h +10 h +6711 m +1 h +10 h +1 h +241 m +4 h +1 h +4 h +1 h +1 h +238 h +10 h +4 h +3150 m +4 h +10 h +10 h +10 h +82 h +820 h +1 h +6712 m +4 h +6713 m +4 h +10 h +1 h +125 h +10 h +10 h +4 h +6714 m +48 h +1 h +6715 m +1 h +57 h +4 h +6716 m +64 h +238 h +6717 m +195 h +1 h +6718 m +4 h +368 h +493 h +6719 m +1 h +57 h +104 h +4 h +124 h +25 h +575 h +10 h +1 h +11 h +4 h +10 h +4 h +1 h +1 h +10 h +57 h +173 h +1 h +143 h +4 h +4 h +6720 m +11 h +1764 m +6721 m +4 h +10 h +4 h +6722 m +10 h +1 h +1 h +1 h +6723 m +6724 m +6725 m +124 h +6726 m +6727 m +1089 h +4 h +6728 m +4 h +4 h +224 h +4 h +12 h +6729 m +386 h +4 h +10 h +4 h +31 h +4 h +6730 m +11 h +82 h +83 h +6731 m +4 h +73 h +986 h +1 h +4 h +10 h +4 h +1 h +10 h +41 h +2914 m +4 h +1197 m +4 h +1 h +10 h +10 h +82 h +4 h +1 h +4 h +4 h +13 h +1 h +4 h +4 h +10 h +640 h +10 h +1 h +6732 m +1 h +11 h +10 h +11 h +10 h +4 h +6733 m +1 h +11 h +6734 m +1 h +97 h +4 h +6735 m +4 h +10 h +1 h +94 h +4 h +10 h +11 h +1 h +119 h +1 h +4 h +3 h +25 h +1 h +97 h +185 h +4 h +4 h +601 h +6736 m +1725 m +6737 m +1185 m +1 h +97 h +10 h +11 h +190 h +4 h +4 h +1 h +6738 m +4 h +4 h +258 h +4 h +4 h +104 h +1 h +4 h +4 h +4 h +31 h +1 h +6739 m +4 h +10 h +10 h +6740 m +1 h +10 h +195 h +10 h +6741 m +2266 m +10 h +8 h +57 h +10 h +57 h +6742 m +169 h +1 h +10 h +6743 m +4 h +6744 m +6745 m +31 h +258 h +10 h +1 h +1 h +1 h +4 h +25 h +10 h +6746 m +4 h +6747 m +444 m +3 h +4 h +1 h +65 h +10 h +97 h +10 h +10 h +10 h +8 h +6748 m +10 h +1 h +4 h +4 h +270 h +1 h +10 h +4 h +73 h +31 h +4 h +11 h +11 h +6749 m +6750 m +10 h +2319 m +229 h +25 h +1 h +6751 m +4 h +4 h +1 h +4 h +41 h +109 h +10 h +82 h +1 h +6752 m +3 h +73 h +190 h +4 h +11 h +10 h +4 h +10 h +4 h +6753 m +1 h +1 h +10 h +1 h +4 h +1 h +6754 m +1 h +10 h +4 h +1 h +6755 
m +4 h +4 h +10 h +4 h +83 h +4 h +169 h +332 h +4 h +307 h +10 h +4 h +4 h +307 h +1 h +6756 m +109 h +297 h +3 h +4 h +1 h +13 h +1 h +25 h +1 h +4 h +1 h +1 h +4 h +4 h +59 h +1 h +4 h +6757 m +55 h +10 h +124 h +143 h +6758 m +31 h +10 h +4 h +10 h +1 h +65 h +74 h +82 h +6759 m +10 h +104 h +104 h +1 h +1 h +6760 m +6761 m +1 h +1 h +1 h +4 h +10 h +6762 m +3373 m +1 h +556 h +6763 m +3025 m +6764 m +4 h +4 h +3 h +784 m +629 m +10 h +4 h +4 h +10 h +6765 m +4 h +56 h +6766 m +31 h +4 h +10 h +10 h +93 h +10 h +82 h +10 h +1137 h +272 h +79 h +147 h +4 h +6767 m +74 h +6768 m +10 h +10 h +10 h +104 h +1 h +10 h +6769 m +97 h +4 h +6770 m +4 h +1 h +6771 m +1 h +69 h +4 h +718 h +31 h +6772 m +10 h +4 h +6773 m +11 h +289 h +6774 m +4 h +195 h +125 h +1 h +143 h +10 h +332 h +10 h +125 h +4 h +83 h +185 h +3199 m +1 h +31 h +4 h +109 h +10 h +10 h +82 h +10 h +4 h +6775 m +10 h +6776 m +1 h +82 h +10 h +1 h +1 h +6777 m +1 h +6778 m +40 h +1 h +6779 m +1 h +4 h +4 h +10 h +1 h +1 h +6780 m +10 h +1 h +1 h +10 h +1406 m +146 h +447 h +1 h +4 h +6781 m +123 h +4 h +4 h +4 h +4 h +4 h +11 h +4 h +11 h +278 h +10 h +976 h +11 h +73 h +11 h +4 h +2532 m +1 h +4 h +6782 m +6783 m +6784 m +6785 m +11 h +112 h +297 h +4 h +6786 m +4 h +6787 m +6788 m +4 h +10 h +2433 m +41 h +6789 m +6790 m +11 h +4 h +12 h +1 h +1409 m +238 h +65 h +11 h +6791 m +1 h +73 h +25 h +6792 m +4 h +10 h +6793 m +6794 m +6795 m +10 h +3 h +6796 m +1 h +203 m +124 h +10 h +10 h +55 h +1650 h +59 h +1 h +6270 m +10 h +185 h +25 h +10 h +1 h +1 h +73 h +110 h +10 h +1642 h +123 h +92 h +1499 m +1 h +4 h +1535 m +1 h +10 h +4 h +1406 m +1 h +575 h +6797 m +10 h +6798 m +6799 m +22 h +4 h +1 h +10 h +3177 m +4 h +4 h +4 h +119 h +493 h +1 h +10 h +10 h +83 h +11 h +6800 m +1 h +4 h +10 h +125 h +11 h +1 h +6801 m +10 h +3 h +59 h +10 h +11 h +8 h +6802 m +1 h +1 h +1 h +1470 h +10 h +77 h +1 h +6803 m +4 h +6804 m +4 h +6805 m +4 h +4 h +1 h +351 m +1 h +4 h +1 h +4 h +6806 m +4 h +4 h +135 h +4 
h +10 h +11 h +4 h +687 h +359 h +6807 m +11 h +6808 m +1030 h +6809 m +6810 m +4 h +82 h +692 h +1 h +6811 m +581 m +11 h +118 h +4 h +4 h +6812 m +4 h +6813 m +1304 m +4 h +10 h +6814 m +295 h +157 h +1 h +1 h +109 h +10 h +10 h +104 h +4 h +1953 m +1 h +4 h +195 h +12 h +109 h +114 h +6815 m +31 h +6816 m +170 h +104 h +6817 m +4 h +10 h +10 h +94 h +135 h +4 h +10 h +41 h +82 h +114 h +13 h +10 h +4 h +10 h +4 h +368 h +97 h +10 h +4 h +82 h +25 h +10 h +83 h +1359 h +1 h +4 h +11 h +6818 m +10 h +1279 m +4689 m +10 h +10 h +6819 m +4 h +4 h +1 h +4 h +184 h +4 h +172 h +4 h +10 h +10 h +1 h +6820 m +69 h +10 h +1 h +25 h +620 m +6821 m +143 h +4 h +1 h +1 h +229 h +6822 m +4 h +10 h +6823 m +124 h +146 h +3 h +1 h +11 h +12 h +1 h +4 h +371 h +1 h +468 m +1 h +3161 m +6824 m +443 h +4 h +4 h +10 h +258 h +4 h +109 h +10 h +6825 m +6826 m +11 h +6827 m +4 h +1 h +6828 m +4574 m +124 h +6829 m +4 h +6830 m +1083 m +6831 m +6832 m +4 h +10 h +1 h +4 h +10 h +6833 m +10 h +57 h +45 h +124 h +4 h +5348 h +109 h +4 h +59 h +5225 m +6834 m +4 h +25 h +10 h +1 h +3240 m +1 h +10 h +10 h +6835 m +6836 m +4 h +4 h +4 h +3 h +1 h +6837 m +10 h +10 h +10 h +1642 h +4 h +112 h +6838 m +1 h +41 h +359 h +4 h +10 h +6839 m +192 h +1 h +90 m +6840 m +625 m +258 h +27 h +1 h +6841 m +4 h +4 h +48 h +692 h +10 h +358 h +10 h +4 h +104 h +3422 m +185 h +1 h +1 h +72 m +10 h +1 h +2410 m +256 h +4 h +12 h +6842 m +4 h +4 h +6843 m +4 h +4 h +358 h +575 h +10 h +77 h +4 h +12 h +4 h +10 h +25 h +11 h +6844 m +6845 m +4 h +1 h +4 h +10 h +1 h +10 h +1 h +1 h +6846 m +8 h +79 h +124 h +10 h +6847 m +4 h +3 h +185 h +6848 m +11 h +1 h +1 h +25 h +1 h +6849 m +6850 m +4 h +4 h +6851 m +45 h +1 h +443 h +185 h +11 h +6852 m +412 m +3 h +4 h +124 h +1 h +908 m +12 h +1 h +40 h +10 h +6853 m +57 h +10 h +6854 m +10 h +4 h +1 h +4 h +11 h +10 h +767 m +124 h +192 h +110 h +4 h +4 h +1 h +6855 m +4 h +230 h +1 h +6856 m +6857 m +801 m +4 h +10 h +1 h +536 h +4 h +1 h +1 h +10 h +1 h +1 h 
+4 h +1 h +4 h +4 h +1 h +11 h +4 h +3 h +6858 m +4 h +2096 m +6859 m +10 h +91 h +1 h +4 h +1470 h +6860 m +4 h +10 h +3 h +6861 m +1 h +104 h +10 h +10 h +6862 m +4 h +4 h +10 h +13 h +185 h +10 h +4 h +97 h +1 h +4 h +56 h +116 m +3398 m +59 h +1454 m +1 h +4 h +25 h +23 h +10 h +10 h +169 h +4 h +4 h +1 h +4 h +73 h +123 h +1 h +3 h +13 h +6863 m +10 h +10 h +4 h +1 h +195 h +82 h +2285 m +4 h +10 h +6864 m +10 h +687 h +4 h +92 h +10 h +1403 h +4 h +6865 m +4 h +6866 m +1 h +4 h +5958 m +4 h +4 h +4863 m +1 h +4 h +109 h +2984 m +4 h +6867 m +4 h +10 h +6868 m +83 h +6869 m +1 h +4 h +6870 m +1 h +6871 m +6872 m +1 h +4 h +278 h +4 h +10 h +1 h +36 h +4 h +4 h +4 h +138 h +11 h +57 h +109 h +10 h +1 h +4 h +1 h +4 h +4 h +3 h +10 h +1 h +6873 m +6874 m +1470 h +25 h +4 h +230 h +1 h +25 h +4 h +167 h +6875 m +219 h +91 h +258 h +279 h +270 h +278 h +79 h +41 h +1 h +1 h +4 h +146 h +1 h +869 m +6876 m +4 h +6877 m +1 h +3 h +4 h +41 h +31 h +6878 m +4 h +10 h +1880 m +186 h +6879 m +124 h +13 h +6880 m +4 h +4 h +1 h +10 h +6881 m +10 h +6882 m +6883 m +4 h +1 h +10 h +10 h +190 h +1 h +6884 m +4 h +31 h +1 h +4 h +4 h +10 h +112 h +1 h +1 h +4 h +4 h +1 h +10 h +6885 m +41 h +6886 m +25 h +6887 m +4 h +5505 m +83 h +1 h +6888 m +1 h +1 h +104 h +4 h +1 h +399 h +73 h +6889 m +6890 m +4 h +10 h +6891 m +74 h +3539 m +10 h +615 m +4 h +1137 h +10 h +10 h +1 h +6892 m +6893 m +4 h +6894 m +172 h +4 h +3825 m +1 h +10 h +4 h +1 h +6895 m +6896 m +1089 h +4 h +6897 m +31 h +1 h +630 m +181 h +6898 m +6899 m +10 h +1 h +25 h +4 h +4 h +1 h +1 h +3 h +6900 m +4 h +94 h +4 h +1 h +6901 m +1359 h +6902 m +1 h +6903 m +1406 h +6904 m +91 h +6905 m +6906 m +3657 m +10 h +6907 m +10 h +1697 m +1968 m +6908 m +119 h +966 m +6909 m +10 h +104 h +5863 m +109 h +6910 m +6911 m +10 h +1 h +6912 m +6913 m +265 h +93 h +433 m +11 h +10 h +1 h +4 h +1 h +6914 m +4 h +82 h +125 h +57 h +6915 m +4 h +1 h +74 h +1 h +6916 m +4 h +4 h +603 m +181 h +59 h +4 h +41 h +601 h +123 h 
+124 h +6917 m +6918 m +4 h +4 h +36 h +1 h +279 h +2887 h +6919 m +1 h +10 h +4 h +1 h +6920 m +10 h +1137 h +4240 m +4 h +6921 m +1 h +4 h +6922 m +109 h +4 h +538 h +6923 m +6924 m +57 h +976 h +10 h +1 h +1 h +10 h +6925 m +124 h +6926 m +1 h +195 h +4 h +857 h +10 h +6927 m +6928 m +1 h +4 h +1 h +10 h +6929 m +1 h +1 h +11 h +6930 m +2733 h +6931 m +6932 m +3513 m +6933 m +6934 m +4 h +270 h +1 h +1 h +31 h +6935 m +97 h +4 h +3 h +3 h +4 h +6936 m +40 h +4 h +64 h +6937 m +6938 m +1 h +74 h +4089 m +1 h +4 h +3 h +6939 m +65 h +73 h +6940 m +1 h +4 h +1 h +4 h +4 h +1105 h +4 h +10 h +4 h +10 h +1 h +1 h +4 h +6941 m +1 h +97 h +4 h +4 h +97 h +4 h +97 h +1 h +1 h +1 h +70 m +10 h +10 h +6942 m +2379 m +11 h +6943 m +4 h +6944 m +82 h +6945 m +1 h +10 h +10 h +10 h +4 h +10 h +6946 m +10 h +1 h +6947 m +6948 m +172 h +4 h +1 h +1685 h +6949 m +1 h +146 h +1 h +4 h +4 h +4 h +5757 m +97 h +256 h +74 h +10 h +3 h +10 h +4 h +91 h +1 h +1 h +10 h +6950 m +10 h +1 h +6951 m +104 h +10 h +1 h +6952 m +1 h +170 h +10 h +6953 m +368 h +4 h +82 h +10 h +10 h +6954 m +10 h +10 h +36 h +4 h +83 h +386 h +1 h +6955 m +11 h +4 h +83 h +1 h +124 h +6956 m +6957 m +307 h +125 h +4 h +6448 m +57 h +1 h +1 h +10 h +4 h +92 h +112 h +6958 m +806 m +6959 m +1 h +6960 m +45 h +1 h +25 h +1 h +1 h +6961 m +10 h +57 h +4 h +1 h +6962 m +41 h +3 h +4 h +6963 m +11 h +4 h +12 h +74 h +10 h +10 h +4 h +196 h +146 h +1 h +6964 m +6965 m +1 h +4 h +6966 m +4 h +10 h +10 h +1 h +84 h +22 h +6967 m +4 h +6968 m +6969 m +4 h +1 h +1 h +6970 m +1 h +4 h +10 h +4 h +3025 m +459 m +94 h +82 h +1 h +97 h +10 h +10 h +1 h +79 h +6971 m +1 h +10 h +110 h +174 h +10 h +11 h +135 h +4 h +4 h +195 h +6972 m +1 h +1 h +119 h +6973 m +1 h +6974 m +6975 m +139 h +6976 m +196 h +172 h +10 h +6977 m +6978 m +1 h +1 h +11 h +36 h +1 h +10 h +10 h +4 h +4 h +6979 m +4 h +6980 m +1 h +6981 m +1 h +4 h +64 h +250 h +1 h +4 h +4 h +143 h +276 h +4 h +1122 m +10 h +12 h +31 h +124 h +1 h +6982 m +6983 m 
+129 h +1 h +3607 m +6984 m +4 h +10 h +1 h +4 h +4 h +6985 m +6986 m +1 h +1220 m +6987 m +4 h +94 h +1 h +74 h +4 h +82 h +6988 m +4 h +185 h +1345 m +1 h +10 h +1 h +1 h +6989 m +4 h +1 h +8 h +25 h +1379 m +1 h +4 h +55 h +1 h +6990 m +8 h +4 h +1 h +4 h +1 h +6991 m +2339 m +65 h +4 h +55 h +6992 m +1 h +6993 m +6994 m +4 h +196 h +1 h +10 h +11 h +10 h +1 h +31 h +4 h +6995 m +97 h +10 h +57 h +1 h +1 h +1 h +6996 m +4 h +4 h +1 h +11 h +6997 m +82 h +147 h +1 h +3 h +1 h +1 h +1 h +4 h +170 h +6998 m +1 h +4 h +59 h +4 h +1 h +10 h +1 h +4 h +123 h +4 h +6999 m +138 h +4 h +1 h +1 h +124 h +4 h +1 h +4 h +4 h +4 h +4 h +4 h +7000 m +4 h +10 h +7001 m +4 h +7002 m +4 h +7003 m +1 h +4 h +1 h +1 h +1 h +7004 m +7005 m +10 h +10 h +4 h +4 h +4 h +7006 m +7007 m +258 h +10 h +241 m +114 h +4 h +10 h +1 h +250 h +4 h +10 h +359 h +11 h +4 h +538 h +59 h +7008 m +2607 m +82 h +1 h +10 h +4 h +7009 m +1 h +7010 m +82 h +1 h +124 h +10 h +1 h +468 m +203 m +4 h +1 h +97 h +7011 m +4 h +22 h +7012 m +4 h +7013 m +1 h +266 h +1 h +7014 m +7015 m +7016 m +4 h +4 h +7017 m +4 h +82 h +10 h +1 h +10 h +3845 m +1 h +7018 m +295 h +1 h +4 h +7019 m +7020 m +10 h +1 h +1 h +94 h +97 h +41 h +83 h +4702 m +109 h +3089 m +10 h +1 h +3 h +7021 m +4 h +7022 m +10 h +1 h +3 h +4 h +332 h +698 m +10 h +4 h +3 h +7023 m +173 h +7024 m +31 h +7025 m +1016 h +109 h +1 h +7026 m +4 h +10 h +332 h +7027 m +10 h +1 h +276 h +7028 m +4 h +1 h +1 h +7029 m +4 h +7030 m +7031 m +10 h +4 h +4 h +25 h +10 h +1 h +757 h +1 h +7032 m +7033 m +258 h +1 h +4 h +4 h +4 h +10 h +1 h +125 h +110 h +7034 m +1 h +169 h +4 h +1 h +4 h +307 h +7035 m +4 h +3 h +319 h +1 h +129 h +56 h +124 h +1 h +10 h +1 h +7036 m +1 h +25 h +22 h +10 h +1 h +129 h +270 h +10 h +1 h +7037 m +1 h +4 h +10 h +1975 m +10 h +1304 m +7038 m +45 h +10 h +10 h +7039 m +4177 m +307 h +4 h +173 h +4 h +1 h +65 h +7040 m +1 h +10 h +7041 m +7042 m +1 h +4 h +7043 m +4 h +4 h +1 h +7044 m +7045 m +4 h +146 h +10 h +7046 m +7047 
m +4 h +93 h +104 h +4 h +4 h +7048 m +59 h +10 h +4 h +7049 m +10 h +4 h +1 h +7050 m +1 h +297 h +4 h +10 h +1 h +4 h +10 h +27 h +7051 m +1 h +11 h +7052 m +10 h +386 h +3 h +7053 m +208 m +41 h +7054 m +10 h +2163 m +7055 m +1 h +93 h +11 h +10 h +170 h +4 h +4 h +4 h +11 h +7056 m +10 h +4 h +10 h +10 h +986 h +10 h +276 h +158 h +1 h +4 h +1 h +319 h +1 h +4 h +1 h +7057 m +10 h +986 h +4 h +1 h +1 h +124 h +7058 m +7059 m +4 h +1697 m +7060 m +4 h +7061 m +11 h +4 h +4 h +7062 m +7063 m +4 h +1 h +124 h +7064 m +3 h +1 h +8 h +1 h +10 h +10 h +31 h +4 h +11 h +7065 m +10 h +4 h +7066 m +1619 h +1 h +7067 m +7068 m +1 h +264 m +1 h +4 h +119 h +4 h +10 h +1 h +2788 m +1 h +7069 m +172 h +10 h +278 h +73 h +7070 m +110 h +10 h +4 h +4 h +1 h +1868 m +59 h +4 h +1 h +10 h +1 h +4 h +1 h +195 h +10 h +1 h +7071 m +4 h +386 h +82 h +7072 m +4 h +10 h +10 h +11 h +7073 m +7074 m +10 h +7075 m +11 h +167 h +4 h +278 h +7076 m +7077 m +82 h +3 h +1 h +4 h +1 h +4 h +1 h +1 h +97 h +27 h +10 h +10 h +46 h +7078 m +11 h +4 h +73 h +4 h +59 h +4 h +4 h +10 h +1 h +7079 m +7080 m +4 h +10 h +1993 m +10 h +25 h +7081 m +10 h +1 h +10 h +10 h +1 h +4 h +10 h +1 h +4 h +57 h +25 h +10 h +10 h +1 h +92 h +1 h +4 h +3 h +31 h +1 h +7082 m +1 h +4 h +7083 m +692 h +1 h +25 h +11 h +4 h +7084 m +1 h +10 h +7085 m +10 h +10 h +13 h +1 h +737 h +7086 m +4 h +138 h +7087 m +7088 m +601 h +976 h +2494 m +1 h +4 h +109 h +7089 m +4 h +10 h +4 h +7090 m +4 h +278 h +230 h +3 h +7091 m +4 h +140 h +1 h +4 h +7092 m +1 h +4 h +7093 m +1 h +124 h +7094 m +28 h +10 h +4 h +7095 m +307 h +7096 m +41 h +1 h +11 h +7097 m +2931 m +11 h +8 h +7098 m +73 h +702 m +10 h +124 h +238 h +7099 m +1478 m +7100 m +4 h +1 h +10 h +10 h +190 h +1 h +307 h +1 h +4 h +4 h +276 h +10 h +1 h +1 h +82 h +27 h +1 h +10 h +10 h +7101 m +10 h +109 h +1 h +7102 m +157 h +45 h +3229 m +1 h +4 h +1 h +11 h +4 h +4 h +4 h +7103 m +4 h +118 h +7104 m +10 h +110 h +4 h +73 h +1 h +10 h +4 h +109 h +435 h +1 h +1 h 
+10 h +7105 m +1016 h +578 m +184 h +7106 m +4 h +1 h +7107 m +7108 m +4 h +4 h +7109 m +13 h +4 h +1 h +31 h +7110 m +4 h +2116 m +10 h +1 h +7111 m +4 h +7112 m +10 h +82 h +2788 m +332 h +1 h +59 h +4 h +59 h +541 m +4 h +56 h +83 h +1 h +74 h +73 h +10 h +7113 m +258 h +10 h +7114 m +1 h +4 h +7115 m +57 h +7116 m +512 m +4 h +125 h +1 h +493 h +4 h +10 h +1 h +4 h +7117 m +1 h +4 h +7118 m +59 h +1 h +56 h +7119 m +4 h +10 h +4 h +276 h +1 h +157 h +10 h +64 h +7120 m +82 h +4 h +124 h +186 h +1 h +4 h +82 h +1 h +4 h +1 h +10 h +1 h +1 h +7121 m +7122 m +7123 m +125 h +4 h +10 h +4 h +10 h +10 h +1 h +10 h +1 h +11 h +36 h +27 h +146 h +10 h +10 h +10 h +1 h +83 h +7124 m +1 h +4 h +11 h +82 h +4 h +4 h +4 h +129 h +4 h +4 h +1 h +479 m +7125 m +1 h +7126 m +7127 m +5813 m +4 h +4 h +147 h +10 h +4 h +124 h +4 h +4 h +4 h +1 h +7128 m +4 h +7129 m +4 h +4 h +40 h +6391 m +7130 m +10 h +1 h +13 h +27 h +7131 m +65 h +10 h +250 h +10 h +1 h +56 h +7132 m +4 h +4 h +7133 m +4 h +7134 m +11 h +7135 m +1016 h +7136 m +367 m +4 h +4 h +7137 m +25 h +7138 m +1 h +4 h +7139 m +83 h +7140 m +1 h +4 h +113 h +4 h +1006 m +10 h +1 h +70 m +7141 m +10 h +4 h +307 h +110 h +11 h +4 h +1 h +41 h +1 h +4 h +1 h +1 h +4 h +33 m +10 h +10 h +4292 m +12 h +11 h +4 h +57 h +4 h +1 h +36 h +4 h +1 h +4 h +4 h +7142 m +1 h +4 h +276 h +10 h +41 h +4 h +986 h +195 h +4 h +10 h +10 h +2769 m +181 h +7143 m +4 h +2540 m +7144 m +4 h +1 h +5673 m +1 h +4 h +73 h +7145 m +7146 m +2788 h +10 h +83 h +10 h +1 h +1 h +3 h +4 h +307 h +4 h +1 h +381 m +4 h +1 h +45 h +10 h +119 h +4 h +4 h +1 h +7147 m +147 h +1 h +4 h +4 h +158 h +57 h +7148 m +119 h +11 h +10 h +7149 m +10 h +7150 m +10 h +4 h +57 h +1 h +4 h +185 h +4 h +1 h +1 h +1 h +4 h +4 h +1 h +4 h +11 h +125 h +1 h +1 h +4 h +4 h +4 h +12 h +79 h +258 h +7151 m +7152 m +7153 m +4 h +10 h +4 h +4 h +83 h +4 h +4 h +10 h +1359 h +10 h +7154 m +1 h +7155 m +4 h +190 h +4 h +383 h +4 h +990 m +4 h +27 h +10 h +730 m +7156 m +1 h +4 
h +4 h +31 h +1 h +10 h +1 h +4 h +7157 m +7158 m +92 h +10 h +10 h +59 h +12 h +7159 m +12 h +7160 m +146 h +4 h +4 h +41 h +4 h +25 h +79 h +4 h +25 h +104 h +2379 m +125 h +10 h +7161 m +358 h +164 h +10 h +10 h +25 h +1 h +4 h +1142 m +4 h +4 h +4 h +4 h +1 h +1 h +4 h +7162 m +10 h +12 h +1 h +139 h +7163 m +10 h +7164 m +1 h +41 h +297 h +1 h +1 h +59 h +1 h +31 h +4 h +1322 h +1 h +7165 m +10 h +692 h +147 h +55 h +276 h +7166 m +10 h +10 h +10 h +3 h +1 h +10 h +4 h +4 h +1074 h +10 h +1 h +4 h +7167 m +1 h +4 h +1 h +10 h +1 h +7168 m +4 h +258 h +3558 h +7169 m +7170 m +7171 m +1 h +214 m +2625 m +276 h +7172 m +10 h +1 h +4 h +1 h +7173 m +11 h +4 h +1 h +1697 h +3 h +7174 m +4 h +83 h +1 h +1 h +4 h +10 h +7175 m +11 h +114 h +7176 m +4 h +94 h +4 h +79 h +4 h +11 h +4276 m +13 h +4 h +4 h +123 h +114 h +3396 m +196 h +57 h +125 h +147 h +10 h +7177 m +1 h +1 h +1 h +7178 m +57 h +10 h +1 h +170 h +10 h +10 h +1 h +4 h +7179 m +83 h +258 h +10 h +7180 m +7181 m +59 h +238 h +10 h +1 h +7182 m +79 h +7183 m +7184 m +7185 m +1 h +4 h +4 h +4 h +278 h +447 h +7186 m +7187 m +59 h +167 h +7188 m +10 h +4 h +1 h +1 h +463 m +36 h +7189 m +238 h +41 h +3 h +125 h +219 h +82 h +7190 m +74 h +1 h +1 h +10 h +10 h +7191 m +11 h +4 h +7192 m +10 h +7193 m +4 h +10 h +82 h +1 h +4 h +1 h +2172 m +10 h +119 h +7194 m +7195 m +10 h +1379 m +1 h +10 h +3847 m +4 h +4 h +1 h +1 h +10 h +11 h +4 h +10 h +4 h +124 h +196 h +4 h +4 h +7196 m +7197 m +4 h +10 h +4 h +11 h +1403 h +55 h +4 h +4 h +7198 m +4 h +857 h +10 h +69 h +104 h +104 h +4 h +4 h +7199 m +10 h +11 h +4 h +1 h +4 h +204 h +4 h +7200 m +11 h +10 h +7201 m +4 h +4 h +1 h +4 h +353 m +7202 m +4 h +1 h +4 h +1 h +59 h +4 h +7203 m +195 h +4 h +195 h +7204 m +10 h +4 h +10 h +1074 h +1 h +10 h +976 h +10 h +7205 m +59 h +7206 m +7207 m +1 h +10 h +1 h +1137 h +10 h +763 m +7208 m +27 h +10 h +10 h +11 h +4 h +1 h +7209 m +7210 m +1 h +104 h +124 h +190 h +4 h +7211 m +258 h +1 h +7212 m +83 h +36 h +7213 m 
+10 h +221 m +10 h +10 h +1 h +1 h +10 h +4 h +7214 m +10 h +7215 m +4 h +569 h +7216 m +125 h +4 h +7217 m +1 h +10 h +7218 m +4 h +25 h +7219 m +7220 m +12 h +7221 m +7222 m +7223 m +3357 m +4 h +11 h +79 h +7224 m +7225 m +82 h +56 h +74 h +4 h +10 h +7226 m +1 h +41 h +10 h +1122 m +1 h +10 h +97 h +31 h +1 h +7227 m +1 h +11 h +186 h +7228 m +4 h +1 h +1 h +4 h +181 h +7229 m +7230 m +13 h +299 h +4 h +10 h +65 h +4 h +113 h +289 h +6747 m +4 h +10 h +7231 m +169 h +238 h +4 h +4 h +92 h +45 h +1 h +4 h +113 h +6197 m +1 h +7232 m +7233 m +10 h +12 h +10 h +692 h +10 h +7234 m +7235 m +258 h +7236 m +7237 m +10 h +31 h +1 h +7238 m +1 h +359 h +10 h +7239 m +169 h +10 h +5923 m +4 h +123 h +97 h +1 h +4 h +4 h +447 h +7240 m +82 h +91 h +65 h +7241 m +4 h +7242 m +4 h +7243 m +1 h +1 h +55 h +2172 m +4858 m +7244 m +82 h +4 h +4 h +7245 m +4 h +7246 m +1 h +10 h +3435 m +7247 m +7248 m +7249 m +114 h +1137 h +4 h +74 h +1 h +79 h +1 h +7250 m +4 h +7251 m +4 h +7252 m +123 h +10 h +1 h +1 h +4 h +4 h +129 h +7253 m +57 h +258 h +10 h +4 h +181 h +10 h +4 h +124 h +4 h +7254 m +4089 m +1 h +56 h +4 h +10 h +258 h +4 h +1 h +11 h +10 h +229 h +195 h +4 h +4 h +4 h +167 h +4 h +185 h +196 h +1 h +7255 m +4 h +1 h +5525 m +359 h +7256 m +7257 m +10 h +4 h +83 h +11 h +238 h +4 h +4 h +7258 m +8 h +4 h +1 h +4 h +59 h +7259 m +5 m +7260 m +4 h +2374 h +4 h +10 h +3 h +1 h +4 h +4 h +1 h +4 h +1 h +1607 m +6731 m +10 h +83 h +7261 m +82 h +167 h +4 h +4 h +110 h +10 h +10 h +7262 m +270 h +7263 m +147 h +10 h +7264 m +7265 m +10 h +4 h +368 h +4 h +1220 m +1 h +4 h +4 h +56 h +82 h +1 h +4 h +83 h +109 h +172 h +7266 m +10 h +1 h +10 h +4 h +4 h +949 m +7267 m +4 h +83 h +1 h +4 h +358 h +10 h +3227 m +10 h +10 h +4 h +4 h +1 h +1 h +1 h +1 h +1 h +1083 m +7268 m +1 h +1 h +443 h +129 h +10 h +74 h +7269 m +3 h +1 h +1 h +1 h +59 h +230 h +7270 m +10 h +7271 m +7272 m +11 h +10 h +7273 m +4 h +74 h +1 h +4 h +4 h +156 h +1 h +164 h +4 h +1 h +888 m +1766 h +4 h +83 
h +1 h +124 h +1 h +4 h +10 h +7274 m +11 h +7275 m +4 h +10 h +83 h +146 h +10 h +7276 m +25 h +7277 m +10 h +4 h +74 h +1 h +270 h +1 h +4 h +7278 m +1 h +1 h +146 h +4 h +276 h +4 h +10 h +10 h +135 h +4 h +195 h +158 h +4 h +1619 h +4 h +1 h +4 h +4 h +4 h +10 h +82 h +31 h +10 h +109 h +83 h +169 h +1791 m +5 m +7279 m +4 h +4 h +794 m +7280 m +7281 m +11 h +2607 m +1 h +4 h +7282 m +1646 m +4 h +7283 m +11 h +10 h +195 h +10 h +7284 m +74 h +11 h +4 h +195 h +10 h +1 h +4 h +4 h +10 h +1 h +4 h +4 h +1083 m +4 h +4 h +176 m +64 h +1868 m +7285 m +4 h +4 h +45 h +3 h +1074 h +11 h +147 h +7286 m +4 h +125 h +4 h +1406 h +10 h +1 h +7287 m +7288 m +11 h +4 h +1 h +1 h +7289 m +7290 m +11 h +1 h +4 h +10 h +1 h +10 h +4 h +7291 m +7292 m +1 h +1535 m +10 h +11 h +7293 m +1 h +282 m +4 h +7294 m +307 h +4 h +7295 m +172 h +10 h +7296 m +124 h +10 h +7297 m +4 h +10 h +4 h +57 h +11 h +1 h +4 h +4 h +4 h +10 h +7298 m +4 h +94 h +7299 m +10 h +4 h +7300 m +468 m +7301 m +11 h +4 h +4 h +4 h +4 h +1 h +1 h +25 h +10 h +7302 m +4 h +109 h +1 h +1 h +25 h +1 h +10 h +79 h +10 h +4 h +203 m +4 h +7303 m +4 h +31 h +4 h +7304 m +4 h +4 h +1 h +3 h +83 h +1 h +146 h +7305 m +92 h +7306 m +10 h +4 h +1 h +3 h +7307 m +7308 m +10 h +4 h +7309 m +143 h +4 h +11 h +11 h +4 h +196 h +94 h +156 h +4 h +4 h +7310 m +124 h +1 h +223 m +5557 m +1619 h +4 h +7311 m +4 h +7312 m +1 h +10 h +10 h +327 m +8 h +4 h +110 h +10 h +4 h +5526 m +10 h +1 h +7313 m +1 h +279 h +1564 m +1 h +7314 m +1 h +4 h +7315 m +83 h +4 h +7316 m +4 h +11 h +1 h +7317 m +4 h +230 h +1 h +7318 m +1359 h +4 h +4 h +4 h +10 h +10 h +73 h +1 h +1 h +307 h +1 h +10 h +119 h +10 h +7319 m +1 h +147 h +65 h +1 h +4 h +6095 m +4 h +10 h +4 h +7320 m +59 h +170 h +7321 m +13 h +4 h +7322 m +1 h +7323 m +82 h +4 h +843 m +1 h +258 h +4 h +7324 m +4 h +10 h +7325 m +109 h +10 h +114 h +10 h +135 h +5325 m +4 h +262 h +7326 m +10 h +1 h +113 h +7327 m +114 h +1 h +7328 m +41 h +41 h +124 h +123 h +4 h +83 h +1 h 
+11 h +1 h +4 h +4 h +55 h +59 h +4 h +12 h +1564 m +4 h +1 h +1 h +143 h +4 h +10 h +1 h +11 h +4033 m +1 h +533 m +7329 m +1 h +7330 m +11 h +10 h +7331 m +109 h +118 h +7332 m +7333 m +1089 h +11 h +10 h +10 h +4 h +7334 m +7335 m +4 h +313 m +4 h +1 h +7336 m +358 h +1 h +5544 m +3324 m +74 h +4 h +1532 m +737 h +1 h +13 h +7337 m +7338 m +10 h +181 h +258 h +3 h +109 h +45 h +4 h +10 h +10 h +4 h +1 h +7339 m +124 h +7340 m +1 h +4 h +7341 m +125 h +1 h +4 h +4 h +7342 m +4 h +55 h +10 h +1 h +7343 m +195 h +138 h +74 h +1 h +83 h +3293 m +7344 m +7345 m +7346 m +10 h +1 h +7347 m +430 m +74 h +10 h +109 h +7348 m +10 h +11 h +1260 m +110 h +10 h +1 h +4 h +57 h +7349 m +258 h +7350 m +104 h +4 h +7351 m +7352 m +41 h +4 h +4 h +73 h +299 h +93 h +4 h +278 h +1 h +41 h +8 h +4 h +4 h +1 h +7353 m +83 h +7354 m +7355 m +4 h +250 h +10 h +5 h +1817 m +104 h +10 h +4 h +11 h +7356 m +7357 m +13 h +1 h +82 h +94 h +4 h +1 h +1 h +59 h +10 h +10 h +1 h +10 h +266 h +7358 m +1 h +4 h +11 h +4 h +1 h +110 h +4 h +10 h +10 h +1 h +1 h +4 h +4 h +10 h +1 h +7359 m +276 h +7360 m +10 h +10 h +3669 m +82 h +4 h +7361 m +167 h +386 h +4608 m +1 h +297 h +7362 m +4 h +190 h +114 h +4 h +1 h +4 h +770 m +10 h +4 h +13 h +278 h +4 h +10 h +7363 m +1 h +10 h +800 m +4 h +4 h +7364 m +10 h +1 h +135 h +4 h +57 h +83 h +139 h +1 h +82 h +10 h +4 h +11 h +195 h +10 h +1470 h +4 h +1 h +4 h +10 h +10 h +10 h +73 h +97 h +4 h +976 h +4 h +1835 m +83 h +11 h +10 h +4 h +4 h +7365 m +7366 m +164 h +986 h +10 h +31 h +11 h +1 h +10 h +7367 m +330 h +4 h +1 h +7368 m +82 h +10 h +125 h +986 h +7369 m +28 h +10 h +1 h +1016 h +4 h +229 h +2883 m +270 h +167 h +10 h +1 h +10 h +7370 m +4 h +986 h +7371 m +1 h +4 h +10 h +1 h +25 h +266 h +7372 m +74 h +3150 m +10 h +106 m +1 h +4 h +1 h +7373 m +11 h +7374 m +7375 m +204 h +4 h +4 h +7376 m +7377 m +1092 m +41 h +4 h +109 h +186 h +4 h +1 h +57 h +1 h +1 h +2909 m +297 h +1 h +4 h +7378 m +1016 h +4 h +4 h +7379 m +93 h +138 h +4 h +164 
h +25 h +4 h +7380 m +10 h +4 h +4 h +4 h +4 h +7381 m +125 h +10 h +4 h +7382 m +1 h +7383 m +4 h +258 h +181 h +196 h +10 h +1 h +4 h +736 m +4 h +59 h +7384 m +10 h +1 h +4 h +4 h +170 h +7385 m +7386 m +1 h +4 h +1 h +1 h +7387 m +82 h +4 h +7388 m +4 h +4 h +7389 m +57 h +4 h +4 h +7214 m +10 h +56 h +7390 m +7391 m +10 h +31 h +1 h +4 h +1 h +4 h +332 h +4 h +10 h +4 h +4 h +10 h +4 h +4 h +40 h +7392 m +7393 m +1308 m +4 h +10 h +1 h +7394 m +10 h +45 h +4 h +1 h +4 h +104 h +7395 m +45 h +8 h +7396 m +104 h +57 h +4 h +1 h +10 h +10 h +146 h +1 h +4 h +478 m +7397 m +7398 m +7399 m +4 h +10 h +4 h +4 h +7400 m +1 h +4 h +10 h +4 h +10 h +4 h +4 h +36 h +10 h +692 h +135 h +10 h +7401 m +5225 m +4 h +2607 m +1 h +74 h +10 h +4 h +25 h +1 h +4 h +4 h +4 h +4 h +7402 m +1 h +10 h +1016 h +4 h +10 h +4 h +4 h +7403 m +11 h +146 h +7404 m +4 h +31 h +11 h +1 h +307 h +7405 m +11 h +184 h +7406 m +359 h +25 h +4 h +7407 m +11 h +13 h +10 h +10 h +4 h +7408 m +4 h +1620 m +114 h +4 h +1 h +1 h +1 h +7409 m +1 h +36 h +4564 m +36 h +4 h +11 h +4 h +124 h +7410 m +4905 m +7411 m +3 h +1 h +353 m +1772 h +10 h +4 h +157 h +10 h +4 h +278 h +1035 m +7412 m +10 h +82 h +4 h +4 h +1539 m +1 h +109 h +84 h +109 h +4 h +4 h +7413 m +125 h +139 h +109 h +1 h +56 h +5145 m +4 h +1 h +119 h +7414 m +1 h +1 h +4 h +4 h +7415 m +935 h +4 h +4 h +119 h +276 h +169 h +4 h +1 h +123 h +1 h +1 h +7416 m +7417 m +4 h +4 h +4 h +224 h +7418 m +7419 m +7420 m +7421 m +262 h +4 h +1780 h +1 h +10 h +108 h +83 h +1 h +56 h +1 h +1403 h +1 h +185 h +11 h +4 h +4 h +10 h +4 h +186 h +3025 m +7422 m +4 h +7423 m +4 h +4 h +48 h +4 h +2720 m +687 h +5254 m +5653 m +7424 m +4 h +433 m +1 h +65 h +359 h +1137 h +4 h +7425 m +4 h +87 m +4 h +1 h +169 h +11 h +4 h +4 h +4 h +4 h +4 h +7426 m +7427 m +4 h +10 h +10 h +4 h +4 h +1 h +65 h +4 h +295 h +276 h +7428 m +4 h +1 h +2961 m +1 h +59 h +7429 m +10 h +64 h +4 h +4 h +10 h +4 h +1 h +1 h +7430 m +12 h +48 h +4 h +1 h +109 h +1 h +1309 m +4 
h +11 h +4 h +156 h +10 h +4 h +4 h +4 h +1 h +10 h +1 h +7431 m +488 h +59 h +190 h +1006 m +57 h +10 h +7432 m +4 h +1 h +7433 m +2002 m +59 h +7434 m +5 h +4 h +45 h +7435 m +447 h +1 h +10 h +7436 m +7437 m +4 h +7438 m +4 h +10 h +1 h +7439 m +12 h +938 m +4 h +10 h +4 h +4 h +7440 m +7441 m +4 h +7442 m +12 h +4 h +1 h +4 h +10 h +278 h +1 h +1 h +1 h +4 h +4 h +57 h +1 h +7443 m +4 h +266 h +4 h +1 h +4 h +7444 m +10 h +10 h +4 h +138 h +4 h +1 h +41 h +4 h +77 h +41 h +59 h +10 h +10 h +4 h +1 h +3561 m +75 m +124 h +7445 m +676 m +7446 m +1016 h +1 h +181 h +139 h +1 h +464 h +97 h +10 h +7447 m +1 h +186 h +7448 m +10 h +10 h +1 h +1 h +10 h +7449 m +332 h +48 h +11 h +172 h +10 h +1 h +4 h +4 h +1 h +170 h +59 h +158 h +7450 m +4 h +7451 m +10 h +274 h +7452 m +4 h +7453 m +1 h +7454 m +56 h +1 h +10 h +106 m +4 h +25 h +4 h +295 h +4 h +2475 m +7455 m +7456 m +4 h +1 h +31 h +11 h +7457 m +10 h +4 h +295 h +4 h +7458 m +11 h +1 h +79 h +181 h +4 h +7459 m +4 h +83 h +2623 m +57 h +538 h +4 h +4 h +109 h +97 h +214 m +4 h +5869 m +5917 m +12 h +1374 m +59 h +135 h +4 h +7460 m +4 h +109 h +4 h +147 h +7461 m +7462 m +10 h +4 h +10 h +4 h +10 h +25 h +25 h +7463 m +114 h +1 h +10 h +1 h +195 h +4 h +56 h +83 h +1 h +10 h +4 h +91 h +74 h +1 h +10 h +276 h +4 h +1 h +41 h +7464 m +70 m +4 h +31 h +7465 m +1 h +1 h +73 h +10 h +4 h +7466 m +1 h +4 h +454 m +3 h +69 h +4 h +13 h +40 h +1 h +7467 m +109 h +10 h +10 h +7468 m +10 h +1 h +1 h +7469 m +41 h +4 h +28 h +10 h +7470 m +55 h +1 h +10 h +4 h +4 h +11 h +4 h +4 h +10 h +3477 m +195 h +7471 m +10 h +7472 m +7473 m +1 h +7474 m +46 h +4 h +1697 h +1 h +94 h +4 h +1 h +7475 m +1 h +7476 m +114 h +7477 m +7478 m +1 h +73 h +109 h +7479 m +11 h +45 h +4 h +4 h +7480 m +4 h +1 h +4 h +4 h +59 h +10 h +104 h +7481 m +11 h +4 h +270 h +7482 m +79 h +1 h +74 h +1 h +1 h +7483 m +1504 m +1 h +147 h +146 h +124 h +7484 m +4 h +97 h +4 h +3 h +4 h +135 h +10 h +4 h +1 h +10 h +7485 m +4 h +10 h +4 h +1 h +1 h +4 
h +4 h +11 h +7486 m +7487 m +7488 m +4 h +4 h +41 h +481 m +4 h +1 h +1 h +8 h +4 h +10 h +7489 m +109 h +4 h +1 h +1 h +478 m +10 h +1 h +1 h +4 h +4 h +1 h +536 h +10 h +266 h +857 h +10 h +4 h +7490 m +4 h +1 h +7491 m +190 h +10 h +359 h +10 h +82 h +4 h +2300 m +7492 m +7493 m +7494 m +7495 m +7496 m +4 h +10 h +10 h +3 h +1 h +140 h +1 h +64 h +304 m +41 h +3 h +97 h +1 h +7497 m +10 h +10 h +1 h +46 h +7498 m +146 h +82 h +4 h +4 h +1 h +4 h +7499 m +10 h +1847 m +10 h +7500 m +386 h +4 h +1 h +59 h +4 h +31 h +11 h +146 h +1 h +4 h +3 h +1 h +146 h +4 h +4 h +36 h +1 h +94 h +1 h +10 h +7501 m +7502 m +114 h +2840 m +4 h +1 h +13 h +4 h +31 h +7503 m +10 h +10 h +7504 m +1 h +7505 m +2002 m +4 h +11 h +1 h +258 h +5505 m +7506 m +4 h +25 h +114 h +4 h +28 h +4 h +238 h +307 h +2666 m +10 h +10 h +4 h +1 h +7507 m +4 h +1 h +10 h +4 h +1 h +155 m +4 h +4 h +7508 m +10 h +11 h +4 h +264 m +4 h +7509 m +4 h +1 h +10 h +4 h +332 h +7510 m +4 h +1116 m +7511 m +4 h +4 h +7512 m +1 h +10 h +4 h +241 m +1 h +1 h +123 h +172 h +4390 m +4 h +4 h +4 h +4 h +4 h +3 h +4 h +10 h +1 h +7513 m +11 h +56 h +129 h +7514 m +55 h +41 h +59 h +7515 m +4 h +11 h +11 h +109 h +10 h +4 h +108 h +7516 m +124 h +10 h +10 h +146 h +7517 m +278 h +7518 m +31 h +83 h +1 h +7519 m +83 h +7520 m +4 h +1 h +59 h +109 h +10 h +1 h +443 h +172 h +11 h +10 h +1 h +10 h +7521 m +82 h +7522 m +4723 m +7523 m +1 h +1 h +7524 m +266 h +7525 m +4 h +1 h +4 h +79 h +104 h +1 h +297 h +56 h +12 h +1 h +4 h +4 h +347 m +1 h +10 h +7526 m +1 h +7527 m +7528 m +10 h +10 h +4 h +1 h +313 m +3 h +4 h +4 h +10 h +1 h +7529 m +1 h +295 h +10 h +4 h +1 h +7530 m +184 h +65 h +124 h +4 h +10 h +4 h +7531 m +1 h +185 h +7532 m +1249 m +10 h +173 h +4 h +7533 m +7534 m +55 h +57 h +104 h +25 h +4 h +10 h +7535 m +1 h +1 h +4 h +4 h +135 h +4 h +1772 h +4 h +4 h +1 h +7536 m +4 h +4 h +11 h +6461 m +65 h +4 h +7537 m +4 h +125 h +4 h +10 h +297 h +7538 m +4 h +123 h +1 h +83 h +10 h +536 h +3028 m +5048 m 
+7539 m +1 h +10 h +10 h +125 h +7540 m +7541 m +1 h +10 h +7542 m +10 h +195 h +3 h +386 h +1 h +278 h +10 h +1 h +4 h +7543 m +4 h +4 h +1 h +4 h +1137 h +10 h +1 h +10 h +1835 m +935 h +4 h +82 h +7544 m +4489 m +4 h +4 h +4 h +11 h +1619 h +4 h +265 h +4 h +7545 m +10 h +7546 m +10 h +109 h +7547 m +4 h +10 h +135 h +4 h +1 h +2054 m +10 h +13 h +4 h +289 h +7548 m +1 h +4 h +2931 m +7549 m +112 h +4 h +1 h +7550 m +3112 m +1 h +10 h +10 h +7551 m +195 h +10 h +4 h +41 h +64 h +1 h +1 h +4 h +25 h +4 h +1 h +13 h +4 h +1 h +1 h +7552 m +4 h +1454 m +4 h +4 h +779 m +5897 m +46 h +7553 m +4 h +4 h +1 h +1 h +4 h +4 h +4 h +266 h +10 h +4 h +885 m +1 h +7554 m +266 h +10 h +10 h +4 h +1 h +7555 m +8 h +195 h +146 h +7556 m +4 h +258 h +4 h +64 h +7557 m +4 h +7558 m +332 h +4 h +1 h +7559 m +1 h +4 h +1 h +262 h +7560 m +1685 h +10 h +1697 h +7561 m +4 h +7562 m +7563 m +45 h +25 h +4 h +4 h +55 h +1 h +7564 m +1 h +1 h +104 h +7565 m +10 h +74 h +1 h +447 h +7566 m +1 h +10 h +1 h +7567 m +7568 m +56 h +1 h +10 h +4 h +4 h +7569 m +6491 m +4 h +57 h +10 h +7570 m +1 h +7571 m +11 h +10 h +1 h +10 h +7572 m +7573 m +1 h +1 h +7574 m +656 h +10 h +10 h +4 h +1 h +7575 m +4 h +7576 m +10 h +10 h +7577 m +65 h +114 h +56 h +7578 m +4 h +7579 m +1 h +25 h +4 h +4 h +4 h +25 h +7580 m +7581 m +7582 m +164 h +1 h +7583 m +1389 h +4 h +82 h +10 h +1 h +10 h +7584 m +4 h +7585 m +4 h +11 h +4 h +94 h +1 h +10 h +4 h +7586 m +1 h +7587 m +109 h +4 h +986 h +4 h +4 h +1 h +7588 m +1 h +74 h +954 m +1 h +7589 m +1 h +55 h +1772 h +97 h +4 h +10 h +4 h +7590 m +11 h +146 h +10 h +4 h +1403 h +124 h +11 h +1 h +692 h +10 h +7591 m +3 h +1 h +4858 m +7592 m +10 h +1 h +64 h +1 h +4 h +1 h +11 h +104 h +4 h +1 h +4 h +10 h +250 h +10 h +4 h +4 h +22 h +642 m +6381 m +10 h +7593 m +4 h +4 h +4 h +386 h +4 h +11 h +4 h +4 h +196 h +7594 m +41 h +486 m +7595 m +4 h +7596 m +7597 m +69 h +241 m +119 h +7598 m +7599 m +2002 h +12 h +7600 m +31 h +4 h +1 h +4 h +4 h +41 h +1 h +125 h 
+157 h +7601 m +7602 m +250 h +7603 m +7604 m +4 h +10 h +1822 h +4 h +7605 m +92 h +109 h +7606 m +464 h +3398 m +1 h +4 h +10 h +146 h +1 h +2962 m +123 h +444 m +7607 m +1886 m +4 h +1 h +1 h +1508 m +4 h +2733 h +10 h +10 h +1 h +11 h +1089 h +10 h +1771 m +7608 m +41 h +4 h +1790 h +1 h +41 h +295 h +7609 m +64 h +4 h +1 h +7610 m +4 h +7611 m +7612 m +184 h +10 h +4 h +7613 m +1 h +4 h +7614 m +4 h +185 h +857 h +4 h +7615 m +11 h +5783 m +10 h +1 h +10 h +4 h +124 h +2072 m +7616 m +1 h +4 h +109 h +7617 m +10 h +97 h +10 h +1 h +138 h +1 h +10 h +7618 m +10 h +4 h +74 h +10 h +1 h +1 h +25 h +4 h +563 m +1 h +10 h +13 h +4 h +7619 m +4 h +4 h +7620 m +83 h +1 h +4 h +4 h +4 h +757 h +10 h +4 h +1 h +10 h +1 h +4 h +181 h +278 h +4 h +4 h +7621 m +3 h +4 h +41 h +10 h +383 h +4 h +4 h +7622 m +4 h +45 h +7623 m +4 h +733 m +1 h +1790 h +4 h +7624 m +258 h +4 h +7625 m +1 h +4 h +4 h +1 h +1 h +10 h +7626 m +7627 m +4 h +7628 m +229 h +146 h +4 h +7629 m +1 h +10 h +4 h +1 h +7630 m +1 h +10 h +4 h +10 h +4 h +10 h +10 h +1 h +11 h +83 h +1 h +276 h +12 h +7631 m +36 h +10 h +7632 m +10 h +190 h +3 h +1822 h +7633 m +1 h +1 h +1 h +7634 m +7635 m +4 h +7636 m +1 h +4 h +114 h +4 h +4 h +10 h +1 h +464 h +4 h +143 h +1 h +7637 m +4 h +11 h +4 h +13 h +4 h +4 h +1 h +10 h +649 h +1 h +7638 m +4 h +7639 m +7640 m +7641 m +10 h +7642 m +7643 m +4 h +4 h +4 h +1 h +1 h +4 h +10 h +448 m +94 h +4 h +1 h +10 h +7644 m +1 h +5 h +2928 m +82 h +5822 m +1 h +258 h +4 h +1 h +4 h +94 h +12 h +7645 m +4 h +7646 m +1445 m +4 h +3025 m +5944 h +4 h +1 h +4 h +590 m +4 h +7647 m +25 h +113 h +1 h +4 h +1 h +1 h +7648 m +1 h +79 h +10 h +10 h +1 h +3 h +1 h +7649 m +7650 m +4 h +1 h +1 h +1 h +11 h +1 h +1 h +1 h +278 h +3 h +97 h +1 h +1 h +83 h +75 m +59 h +7651 m +57 h +1 h +82 h +7652 m +4 h +11 h +7653 m +4 h +7654 m +10 h +4 h +7655 m +4 h +1 h +109 h +10 h +7656 m +1 h +59 h +4 h +4 h +4 h +10 h +8 h +146 h +10 h +11 h +4 h +620 m +17 m +7657 m +7658 m +7659 m +4 h +4 
h +10 h +73 h +10 h +1 h +1 h +4 h +7660 m +13 h +4 h +7661 m +1 h +1 h +10 h +4 h +4 h +7662 m +4 h +4 h +4 h +97 h +7663 m +7447 m +4 h +4 h +7664 m +1 h +4 h +10 h +13 h +77 h +65 h +7665 m +307 h +4 h +7666 m +4 h +7253 m +104 h +1 h +4 h +55 h +4 h +157 h +184 h +7667 m +10 h +10 h +59 h +297 h +10 h +36 h +59 h +4 h +7668 m +55 h +814 m +82 h +7669 m +3396 m +7670 m +468 m +10 h +1 h +10 h +82 h +4 h +1 h +10 h +10 h +1 h +1359 h +7671 m +7672 m +4 h +1 h +7673 m +7674 m +11 h +7675 m +7676 m +7677 m +4 h +10 h +31 h +1 h +1 h +97 h +1 h +10 h +97 h +520 m +64 h +4 h +110 h +4 h +4 h +4 h +1 h +83 h +92 h +1 h +109 h +1 h +1 h +7678 m +4 h +169 h +10 h +11 h +4 h +4 h +7679 m +7680 m +4 h +10 h +2522 m +84 h +10 h +4 h +581 m +7681 m +4 h +1 h +94 h +4 h +83 h +97 h +48 h +92 h +7682 m +4 h +10 h +4 h +7683 m +64 h +4 h +11 h +4 h +10 h +1 h +4 h +4 h +258 h +10 h +36 h +48 h +22 h +1 h +1 h +12 h +10 h +1 h +65 h +10 h +7684 m +4 h +12 h +1 h +65 h +7685 m +7686 m +11 h +1 h +1 h +10 h +10 h +109 h +10 h +7687 m +4 h +1 h +181 h +3 h +4 h +7688 m +10 h +55 h +7689 m +4 h +10 h +4 h +7690 m +1 h +83 h +4 h +147 h +74 h +4 h +196 h +4 h +986 h +7691 m +10 h +4 h +7692 m +7693 m +167 h +10 h +1 h +3 h +1 h +2887 h +994 m +256 h +10 h +4 h +97 h +10 h +7694 m +10 h +7695 m +156 h +10 h +4 h +195 h +144 h +10 h +11 h +4 h +640 h +7696 m +4 h +173 h +4 h +1 h +208 m +7697 m +82 h +10 h +1722 m +10 h +59 h +1 h +12 h +12 h +4 h +4 h +10 h +169 h +36 h +443 h +124 h +97 h +10 h +4 h +7698 m +7699 m +83 h +7700 m +41 h +1 h +5379 m +4 h +7701 m +1 h +1 h +7702 m +4 h +1 h +25 h +1 h +258 h +7703 m +1 h +1 h +11 h +3979 m +1 h +238 h +7704 m +1083 h +7705 m +11 h +74 h +173 h +488 h +7706 m +1 h +1 h +4 h +83 h +1 h +1 h +908 m +10 h +45 h +1 h +4 h +1027 h +4 h +10 h +1 h +4 h +4 h +447 h +195 h +146 h +10 h +4 h +1 h +4 h +7707 m +2172 h +124 h +4 h +138 h +4 h +1 h +74 h +7708 m +2733 h +3 h +1 h +4 h +10 h +7709 m +7710 m +64 h +1 h +7711 m +7712 m +10 h +7713 m 
+10 h +4 h +7714 m +10 h +7715 m +7716 m +1445 m +57 h +4 h +7717 m +10 h +17 m +7718 m +1 h +91 h +4 h +108 h +7719 m +10 h +1 h +10 h +1 h +4 h +295 h +104 h +10 h +4 h +2111 m +4 h +1 h +4 h +4 h +11 h +1 h +7720 m +110 h +4 h +110 h +1 h +4 h +272 h +1 h +1 h +119 h +4 h +1 h +1 h +4 h +104 h +1137 h +7721 m +10 h +10 h +4 h +7722 m +10 h +92 h +10 h +138 h +4 h +10 h +10 h +1 h +986 h +4 h +3 h +10 h +12 h +278 h +4 h +590 m +1 h +11 h +7723 m +7724 m +1 h +1 h +4 h +1 h +48 h +7725 m +1 h +1 h +278 h +10 h +7726 m +7727 m +477 m +10 h +7728 m +7729 m +1 h +1 h +10 h +10 h +104 h +13 h +4 h +4 h +4 h +1 h +4 h +1 h +1 h +4 h +4 h +4 h +10 h +59 h +4 h +10 h +4 h +10 h +7730 m +976 h +1 h +7731 m +4 h +1 h +56 h +181 h +4 h +7732 m +1 h +7733 m +4 h +10 h +4 h +10 h +10 h +10 h +10 h +7734 m +146 h +1 h +147 h +7735 m +74 h +7736 m +4 h +1 h +10 h +7737 m +31 h +10 h +7738 m +433 m +7739 m +4 h +4 h +82 h +1 h +7740 m +1 h +7741 m +4 h +4 h +10 h +4 h +57 h +4 h +31 h +556 h +7742 m +4 h +1 h +56 h +7743 m +4 h +7744 m +4 h +4 h +7745 m +10 h +11 h +4 h +4 h +1 h +7746 m +1 h +4 h +3702 m +11 h +124 h +1122 m +4 h +1 h +169 h +7747 m +368 h +1 h +11 h +1 h +1 h +113 h +7661 m +1 h +146 h +4 h +1666 m +1 h +65 h +1 h +285 m +1 h +4 h +7748 m +7749 m +4 h +1 h +140 h +69 h +1880 m +4 h +7750 m +1 h +1 h +1 h +82 h +1 h +4 h +10 h +114 h +2374 h +10 h +538 h +4 h +55 h +109 h +7751 m +2314 m +1 h +266 h +1 h +92 h +83 h +3 h +737 h +5 h +11 h +124 h +7752 m +7753 m +520 m +41 h +41 h +4 h +4 h +4 h +4 h +27 h +4 h +4 h +4 h +4 h +110 h +784 h +1 h +7754 m +1 h +59 h +7755 m +1 h +7756 m +97 h +97 h +1822 h +31 h +7757 m +7758 m +1 h +4 h +10 h +4 h +7759 m +1 h +125 h +7760 m +1650 h +7761 m +4 h +92 h +1 h +5093 m +11 h +157 h +11 h +1 h +11 h +1 h +4 h +190 h +4 h +716 m +278 h +1835 m +4 h +7762 m +1 h +1 h +1 h +74 h +1137 h +1 h +4 h +7763 m +10 h +1 h +1 h +11 h +4 h +7764 m +520 h +4 h +10 h +11 h +10 h +383 h +1 h +7765 m +31 h +4 h +10 h +7766 m +7767 m 
+119 h +7768 m +4 h +10 h +7769 m +1470 h +986 h +56 h +593 m +10 h +7770 m +10 h +10 h +6370 m +82 h +7771 m +4 h +4 h +82 h +185 h +4 h +1 h +3 h +10 h +7772 m +93 h +7773 m +1 h +125 h +10 h +7774 m +59 h +55 h +7775 m +1 h +7776 m +270 h +94 h +2079 m +92 h +7777 m +258 h +4 h +208 m +1 h +4 h +10 h +1 h +1260 m +40 h +4 h +4 h +4 h +146 h +4 h +7778 m +10 h +25 h +74 h +10 h +4 h +11 h +4 h +4 h +4 h +83 h +94 h +124 h +276 h +1595 m +7779 m +4 h +10 h +7780 m +10 h +4 h +1 h +7781 m +4 h +4 h +10 h +10 h +7782 m +7783 m +10 h +114 h +7784 m +447 h +4 h +10 h +3 h +1 h +7785 m +57 h +4 h +1780 h +1 h +7786 m +10 h +157 h +181 h +4 h +10 h +4 h +4 h +7787 m +1 h +4 h +4 h +73 h +57 h +10 h +1 h +7788 m +181 h +1 h +41 h +1650 h +4 h +2788 h +112 h +1 h +4 h +11 h +10 h +4 h +7789 m +4 h +4 h +139 h +10 h +25 h +10 h +5 h +4 h +1 h +69 h +7790 m +7791 m +1185 m +7792 m +1445 h +1123 m +7793 m +1 h +124 h +1 h +74 h +4 h +7794 m +7795 m +7796 m +8 h +11 h +4 h +172 h +10 h +1 h +4 h +10 h +65 h +41 h +7797 m +4 h +10 h +4 h +8 h +692 h +83 h +10 h +204 h +4 h +1198 m +7798 m +1 h +1 h +4 h +7799 m +10 h +139 h +10 h +11 h +4 h +4 h +6726 m +41 h +114 h +7800 m +11 h +92 h +7801 m +143 h +10 h +368 h +1 h +124 h +1 h +1 h +4975 m +7802 m +601 h +7803 m +7804 m +1016 h +7805 m +278 h +1 h +7806 m +12 h +7807 m +1 h +7808 m +10 h +1105 h +7809 m +174 h +4 h +1 h +4 h +1 h +3 h +5 h +109 h +4 h +124 h +4 h +228 m +7810 m +104 h +1 h +10 h +1 h +41 h +265 h +10 h +74 h +7811 m +520 h +10 h +2625 m +10 h +4 h +10 h +7812 m +1 h +7813 m +73 h +1 h +10 h +4 h +1105 h +31 h +7814 m +1 h +4 h +3 h +7815 m +4 h +7816 m +7817 m +59 h +10 h +4 h +4 h +7818 m +6221 m +4 h +167 h +443 h +7819 m +4 h +1 h +27 h +7820 m +104 h +1 h +4 h +4 h +1 h +1470 h +1 h +92 h +83 h +10 h +1 h +4 h +79 h +7821 m +1 h +1 h +4 h +7822 m +1 h +7823 m +4 h +7824 m +262 h +65 h +7825 m +1 h +125 h +4 h +11 h +4 h +2625 m +73 h +10 h +4 h +22 h +7826 m +4 h +4 h +7827 m +2851 m +4 h +1 h +1 h +83 
h +195 h +59 h +4 h +57 h +4 h +1 h +1646 m +1 h +12 h +4 h +10 h +1619 h +10 h +4 h +7828 m +289 h +7829 m +278 h +10 h +4 h +11 h +4 h +7830 m +7831 m +4 h +7832 m +41 h +4 h +536 h +10 h +1 h +7833 m +1 h +10 h +4 h +11 h +97 h +10 h +692 h +7834 m +4 h +1 h +83 h +4 h +97 h +92 h +297 h +4 h +1016 h +4 h +4 h +1 h +10 h +4 h +7835 m +4 h +104 h +10 h +7836 m +25 h +7837 m +7838 m +4 h +447 h +10 h +10 h +59 h +1 h +10 h +10 h +7839 m +7840 m +7841 m +4 h +10 h +7842 m +4 h +4 h +56 h +11 h +10 h +97 h +10 h +11 h +169 h +7843 m +1 h +10 h +41 h +1083 h +1 h +10 h +1089 h +25 h +11 h +7844 m +4 h +10 h +1169 m +4 h +7845 m +10 h +7846 m +41 h +22 h +4 h +7847 m +36 h +158 h +7848 m +7849 m +4 h +109 h +7850 m +1 h +185 h +399 h +7851 m +246 m +82 h +104 h +7852 m +4 h +79 h +219 h +1 h +123 h +1 h +7853 m +4 h +25 h +307 h +7854 m +7855 m +170 h +1 h +4 h +1 h +172 h +10 h +6144 m +109 h +3 h +1016 h +7856 m +7857 m +4 h +7858 m +258 h +4 h +196 h +1751 m +1260 m +4 h +7859 m +7860 m +83 h +7861 m +307 h +1 h +1 h +1 h +4 h +276 h +4 h +10 h +7862 m +1 h +119 h +1 h +4 h +3307 m +181 h +4535 m +10 h +1 h +4 h +4 h +7863 m +1 h +2719 h +297 h +10 h +167 h +10 h +124 h +4 h +264 m +358 h +10 h +83 h +55 h +1 h +13 h +1 h +4 h +4 h +124 h +4 h +358 h +4 h +7864 m +10 h +1 h +10 h +4 h +1 h +7865 m +4 h +1 h +97 h +124 h +195 h +10 h +4 h +7866 m +73 h +124 h +250 h +371 h +59 h +1796 m +73 h +4 h +1 h +7867 m +4 h +4 h +125 h +630 m +2591 m +1 h +7868 m +7869 m +4 h +2623 m +1 h +112 h +7870 m +4 h +4 h +22 h +1 h +4 h +4 h +57 h +463 m +1 h +7871 m +3 h +4 h +10 h +1 h +1250 h +1 h +1137 h +7872 m +447 h +10 h +10 h +7873 m +7874 m +1 h +1 h +138 h +7875 m +10 h +1089 h +10 h +185 h +4 h +10 h +4 h +7876 m +538 h +4 h +31 h +4 h +2172 h +10 h +1 h +10 h +7126 m +443 h +7877 m +167 h +4 h +4 h +1 h +7878 m +57 h +4 h +4 h +1362 h +1 h +1 h +4 h +4 h +11 h +1 h +10 h +7879 m +31 h +4 h +1020 m +4 h +4 h +124 h +3 h +124 h +10 h +1 h +4 h +8 h +4 h +7880 m +4 h +158 
h +4 h +4 h +1 h +114 h +278 h +83 h +5933 m +10 h +181 h +4 h +7881 m +73 h +56 h +1 h +3680 m +1309 m +10 h +112 h +3299 m +172 h +630 m +10 h +92 h +10 h +1 h +74 h +10 h +1 h +109 h +57 h +167 h +4 h +1 h +10 h +129 h +7882 m +10 h +181 h +7883 m +10 h +1 h +1 h +4 h +7884 m +27 h +7885 m +11 h +4 h +110 h +1 h +8 h +7886 m +10 h +196 h +10 h +7887 m +4 h +10 h +10 h +1 h +45 h +4 h +7888 m +7889 m +258 h +10 h +2379 m +7890 m +13 h +10 h +4 h +278 h +8 h +10 h +3479 m +371 h +1 h +4 h +1 h +1 h +59 h +4 h +2447 m +10 h +1 h +7891 m +4 h +164 h +7892 m +1 h +10 h +7893 m +4 h +295 h +10 h +1 h +1 h +4 h +7894 m +4 h +4 h +4 h +125 h +10 h +1 h +1 h +10 h +10 h +358 h +4 h +31 h +36 h +196 h +4 h +4 h +7895 m +4 h +7896 m +1 h +139 h +83 h +10 h +4 h +7897 m +7898 m +1 h +10 h +123 h +7899 m +258 h +11 h +4 h +25 h +1030 h +31 h +10 h +1 h +4 h +1 h +97 h +4 h +123 h +172 h +1 h +4 h +4 h +7900 m +27 h +4 h +1 h +4 h +7901 m +124 h +1 h +229 h +11 h +139 h +10 h +7902 m +7903 m +4 h +4 h +4 h +41 h +7904 m +7905 m +10 h +4 h +4 h +146 h +10 h +10 h +4 h +4 h +7906 m +1 h +4240 m +4350 m +10 h +10 h +7907 m +7908 m +1508 m +1 h +4 h +1 h +1 h +10 h +3 h +7909 m +25 h +97 h +10 h +1 h +4 h +45 h +7910 m +82 h +4 h +7911 m +4 h +4 h +6954 m +7912 m +196 h +4 h +7913 m +4 h +74 h +4 h +4 h +1 h +4 h +4 h +7914 m +7915 m +156 h +4 h +106 h +10 h +83 h +1 h +7916 m +7917 m +1 h +4 h +4 h +1 h +1 h +250 h +57 h +167 h +82 h +4 h +10 h +10 h +4 h +4 h +4 h +109 h +1 h +1 h +4 h +10 h +4 h +10 h +4292 m +4 h +7918 m +4 h +59 h +1 h +1 h +265 h +4 h +10 h +7919 m +4 h +64 h +7920 m +488 h +10 h +1 h +73 h +7921 m +3 h +4 h +10 h +4 h +388 m +167 h +7922 m +386 h +1 h +3 h +4 h +1 h +4 h +4 h +2887 h +7923 m +25 h +1 h +1 h +4 h +7924 m +7925 m +5141 m +8 h +146 h +7926 m +7927 m +1 h +11 h +4 h +7839 m +41 h +91 h +7928 m +4 h +4 h +1 h +114 h +1 h +7929 m +112 h +40 h +196 h +7930 m +10 h +10 h +10 h +7931 m +146 h +4 h +1 h +3 h +1 h +83 h +4 h +4 h +1337 m +11 h +4 h 
+7932 m +1 h +4 h +64 h +7933 m +7661 h +124 h +3 h +7934 m +10 h +7935 m +173 h +7936 m +2530 m +10 h +1957 m +10 h +7937 m +1 h +7938 m +190 h +10 h +108 h +4 h +7939 m +1 h +11 h +7940 m +4 h +1 h +1 h +4 h +1249 m +10 h +82 h +146 h +1 h +59 h +1 h +4 h +7941 m +4 h +10 h +4 h +146 h +7942 m +1 h +1 h +4 h +109 h +1 h +2813 m +55 h +368 h +4 h +1 h +4 h +4 h +7943 m +7944 m +1 h +4 h +7945 m +3342 m +83 h +1 h +1 h +5653 m +10 h +307 h +276 h +7946 m +7947 m +7948 m +4 h +45 h +1027 h +4 h +1116 m +7949 m +124 h +1 h +31 h +10 h +4 h +7950 m +10 h +1685 h +124 h +1 h +7951 m +10 h +1 h +7952 m +83 h +135 h +65 h +12 h +135 h +4 h +25 h +3422 m +1 h +124 h +92 h +139 h +82 h +109 h +1 h +1 h +4 h +10 h +124 h +4 h +4 h +4 h +92 h +146 h +10 h +10 h +4 h +4 h +11 h +109 h +10 h +4 h +1 h +10 h +1249 m +1 h +10 h +1 h +7953 m +77 h +7954 m +1403 h +4 h +4 h +7955 m +4 h +965 m +4 h +238 h +1 h +83 h +65 h +556 h +4 h +4 h +403 h +7956 m +4 h +4 h +1 h +4 h +11 h +7957 m +4 h +692 h +1064 m +172 h +1 h +1 h +1 h +319 h +371 h +31 h +1470 h +7958 m +41 h +7959 m +10 h +1 h +11 h +1 h +1 h +3341 m +104 h +82 h +45 h +4 h +4 h +7960 m +123 h +10 h +4 h +1 h +4 h +10 h +4 h +10 h +36 h +82 h +266 h +147 h +10 h +56 h +1 h +4 h +4 h +45 h +7961 m +103 m +1 h +1892 m +73 h +7962 m +4 h +4 h +1 h +583 m +4 h +10 h +1 h +1 h +7963 m +4 h +10 h +4 h +4 h +7964 m +94 h +4 h +2004 m +1 h +1 h +10 h +4 h +56 h +1 h +10 h +4 h +4 h +143 h +169 h +7965 m +976 h +1 h +1 h +82 h +7966 m +1 h +25 h +7967 m +11 h +4 h +1 h +1 h +7968 m +114 h +4 h +119 h +1 h +7969 m +7970 m +1 h +478 h +1 h +4 h +82 h +4 h +10 h +64 h +10 h +4 h +7971 m +7972 m +4 h +4 h +1 h +1261 m +718 h +1 h +1 h +65 h +7973 m +124 h +45 h +4 h +4 h +4 h +41 h +92 h +1 h +7974 m +4 h +4 h +270 h +17 h +1 h +12 h +1 h +10 h +7975 m +25 h +4 h +124 h +1 h +10 h +4 h +1 h +97 h +4 h +1 h +7976 m +10 h +7977 m +1089 h +7978 m +1 h +7979 m +4 h +4 h +4 h +1 h +7980 m +7981 m +7982 m +10 h +1 h +41 h +125 h +1535 m 
+10 h +601 h +1 h +10 h +1 h +7983 m +7984 m +4 h +73 h +104 h +1619 h +109 h +82 h +4 h +7985 m +11 h +7986 m +1 h +1137 h +1 h +7987 m +4 h +4 h +698 m +918 m +4 h +1 h +1 h +2733 h +383 h +1 h +4 h +10 h +7988 m +4 h +1766 h +45 h +36 h +7989 m +7990 m +144 h +7991 m +4 h +1785 m +4 h +7992 m +56 h +4 h +10 h +7993 m +4 h +4 h +10 h +7994 m +7995 m +56 h +696 m +4 h +4 h +4 h +7996 m +10 h +41 h +1 h +104 h +36 h +1 h +4 h +7997 m +1 h +144 h +1 h +172 h +7998 m +4 h +7999 m +82 h +8000 m +1 h +640 h +8001 m +27 h +8002 m +1 h +4 h +2941 m +8003 m +4 h +1053 m +8004 m +976 h +11 h +82 h +10 h +4 h +55 h +1 h +31 h +4 h +4 h +4 h +82 h +157 h +10 h +10 h +10 h +1 h +1202 m +4 h +8005 m +4 h +4 h +1 h +1 h +4 h +4 h +4 h +10 h +4 h +8006 m +3555 m +4 h +224 h +10 h +4 h +11 h +45 h +1 h +1 h +8007 m +8008 m +59 h +10 h +1 h +4 h +4 h +1 h +8009 m +1122 m +1 h +1 h +412 m +4 h +10 h +1 h +10 h +1 h +10 h +10 h +1 h +10 h +4 h +4 h +97 h +3 h +8010 m +8011 m +10 h +1796 m +10 h +8012 m +57 h +229 h +4 h +1 h +8013 m +8014 m +4 h +10 h +8015 m +3 h +184 h +185 h +4 h +1016 h +976 h +5590 m +190 h +8 h +25 h +1 h +8016 m +10 h +10 h +3 h +8017 m +4 h +4 h +4 h +10 h +8018 m +8 h +8019 m +1 h +737 h +520 h +11 h +4 h +8020 m +185 h +1 h +8021 m +8022 m +1 h +478 h +10 h +8023 m +41 h +57 h +10 h +10 h +4229 m +82 h +10 h +1114 m +4 h +447 h +1 h +11 h +3112 m +1 h +4 h +4 h +10 h +4 h +4 h +10 h +4 h +299 h +1 h +4 h +10 h +1 h +10 h +4 h +297 h +10 h +4 h +1 h +4 h +10 h +10 h +8024 m +8025 m +1 h +4 h +10 h +4 h +8026 m +1201 m +285 m +181 h +1 h +8027 m +11 h +10 h +433 m +8028 m +56 h +4 h +8029 m +4 h +8030 m +104 h +4 h +74 h +1 h +8031 m +185 h +1 h +97 h +278 h +1 h +8032 m +4 h +1016 h +11 h +1 h +1 h +1 h +4 h +10 h +4 h +59 h +4 h +11 h +8033 m +4 h +1646 m +1 h +4 h +8034 m +4 h +1 h +1 h +4 h +8035 m +56 h +4 h +4 h +8036 m +1 h +10 h +443 h +4 h +4 h +1 h +8037 m +8038 m +4 h +332 h +1 h +4 h +8039 m +10 h +123 h +1 h +4 h +8040 m +307 h +31 h +25 h +8041 
m +82 h +8042 m +1 h +1 h +8043 m +8044 m +169 h +8045 m +265 h +27 h +91 h +10 h +6438 m +8046 m +4 h +124 h +10 h +1 h +8047 m +4 h +4 h +1 h +4 h +1 h +307 h +1 h +1 h +64 h +57 h +41 h +4 h +4 h +10 h +8048 m +11 h +1 h +4 h +1 h +1 h +8049 m +10 h +8050 m +8051 m +10 h +4 h +1 h +4 h +10 h +114 h +2314 m +1 h +41 h +10 h +4 h +8052 m +10 h +83 h +10 h +59 h +1 h +11 h +1406 h +1 h +687 h +25 h +11 h +4 h +10 h +447 h +4 h +36 h +41 h +1 h +10 h +10 h +82 h +4 h +57 h +4 h +1 h +11 h +173 h +265 h +170 h +11 h +1454 m +11 h +119 h +97 h +1 h +4 h +8053 m +1 h +8054 m +1 h +79 h +8055 m +10 h +8056 m +27 h +70 m +6731 m +4 h +1 h +4 h +8057 m +22 h +8058 m +10 h +10 h +5917 m +8059 m +10 h +4 h +238 h +4590 m +27 h +10 h +656 h +8060 m +8061 m +10 h +8062 m +2002 h +4 h +10 h +8063 m +1 h +2625 h +11 h +1 h +1957 m +172 h +4 h +1 h +4 h +4 h +8064 m +4 h +27 h +10 h +8065 m +4 h +8066 m +124 h +4 h +4 h +1790 h +97 h +1 h +28 h +8067 m +4 h +4 h +8068 m +4 h +8069 m +11 h +143 h +1 h +4 h +8070 m +10 h +1 h +10 h +4 h +359 h +289 h +114 h +10 h +1 h +79 h +4 h +4 h +3 h +1 h +8071 m +347 m +1 h +10 h +13 h +1 h +10 h +82 h +73 h +1 h +125 h +1 h +8072 m +8073 m +8074 m +8075 m +8076 m +11 h +238 h +3679 m +4 h +4 h +45 h +8077 m +1 h +1 h +143 h +8078 m +12 h +4 h +8079 m +1 h +8080 m +1 h +11 h +4 h +2265 m +146 h +10 h +4 h +297 h +1 h +4 h +1620 m +642 m +5917 m +97 h +4 h +8081 m +8082 m +4 h +1 h +10 h +8083 m +11 h +1 h +114 h +1 h +10 h +22 h +279 h +4 h +1 h +1 h +10 h +8084 m +1067 m +8085 m +8086 m +4 h +8087 m +8088 m +10 h +359 h +94 h +4 h +10 h +4 h +114 h +8089 m +8090 m +1 h +1 h +1 h +1 h +8091 m +4 h +10 h +4 h +4 h +31 h +8092 m +1 h +94 h +4 h +4 h +124 h +31 h +1 h +124 h +8093 m +4 h +10 h +1074 h +238 h +10 h +57 h +59 h +1 h +1 h +1 h +1105 h +1 h +8094 m +8095 m +4 h +4 h +276 h +192 h +10 h +4 h +8096 m +1 h +935 h +4 h +109 h +10 h +4 h +4 h +8097 m +8098 m +8099 m +10 h +2379 m +4 h +1 h +10 h +4 h +4 h +8100 m +1 h +10 h +1 h +8101 m 
+10 h +8102 m +73 h +704 m +8103 m +307 h +4 h +109 h +1 h +4 h +4 h +1 h +4 h +8104 m +4 h +190 h +10 h +1092 m +113 h +1 h +109 h +10 h +4 h +10 h +10 h +459 m +2418 m +8 h +1650 h +4 h +64 h +8105 m +4 h +1 h +1 h +41 h +8106 m +10 h +8107 m +57 h +8108 m +129 h +4 h +10 h +106 h +11 h +59 h +8109 m +10 h +8110 m +4 h +170 h +4 h +8111 m +8112 m +8113 m +4 h +10 h +10 h +1 h +347 m +57 h +1 h +8114 m +1027 h +1751 m +83 h +10 h +1 h +4 h +172 h +56 h +2616 m +1 h +443 h +10 h +10 h +1 h +8115 m +1 h +8116 m +8117 m +4 h +10 h +1 h +8118 m +10 h +1 h +11 h +8119 m +8120 m +8121 m +1 h +1 h +4 h +4 h +55 h +857 h +620 m +1 h +65 h +1 h +59 h +258 h +11 h +1 h +1 h +4 h +8122 m +4 h +8123 m +195 h +1 h +172 h +1 h +687 h +10 h +8124 m +8125 m +10 h +1 h +10 h +8126 m +10 h +8127 m +4 h +156 h +8128 m +1886 m +8129 m +4 h +10 h +4 h +57 h +4 h +94 h +1 h +1 h +8130 m +10 h +4 h +4 h +143 h +4 h +1 h +8131 m +82 h +8132 m +1 h +1 h +1 h +10 h +83 h +1 h +8133 m +125 h +25 h +10 h +8134 m +11 h +371 h +4 h +4 h +5917 h +92 h +258 h +4 h +10 h +10 h +167 h +10 h +1 h +8135 m +4 h +4 h +1 h +10 h +4 h +1 h +8136 m +146 h +10 h +4378 m +4 h +1 h +10 h +10 h +4 h +22 h +4 h +10 h +4 h +4 h +57 h +965 m +5387 m +4 h +1 h +4 h +506 m +195 h +124 h +1 h +41 h +109 h +8137 m +509 m +4 h +4 h +10 h +114 h +10 h +1 h +4 h +1 h +4 h +278 h +4 h +5229 m +1403 h +4 h +1137 h +124 h +4 h +447 h +10 h +186 h +13 h +10 h +1 h +4 h +279 h +12 h +4 h +4 h +1 h +114 h +8138 m +1 h +8139 m +10 h +8140 m +250 h +10 h +8141 m +4 h +10 h +11 h +109 h +1 h +1 h +1 h +4 h +4 h +1 h +10 h +143 h +1 h +520 h +4 h +170 h +278 h +4 h +1 h +82 h +4 h +10 h +1 h +83 h +4 h +10 h +4 h +601 h +1 h +8142 m +8143 m +4 h +8144 m +185 h +7787 m +8 h +8145 m +135 h +10 h +1 h +5929 m +1 h +272 h +4 h +8146 m +8147 m +157 h +11 h +4 h +4 h +4 h +8148 m +7348 m +4 h +1 h +31 h +4 h +4 h +1 h +196 h +75 m +1 h +10 h +4 h +10 h +56 h +1 h +2508 m +1 h +1 h +4 h +8149 m +1 h +4 h +4 h +109 h +1445 h +4 h +10 h 
+124 h +73 h +10 h +4 h +4 h +1403 h +1 h +10 h +4 h +8150 m +123 h +444 m +4 h +4 h +11 h +3303 m +10 h +10 h +59 h +82 h +4 h +1 h +82 h +4 h +913 m +1 h +12 h +123 h +13 h +82 h +4 h +4 h +1 h +278 h +10 h +8151 m +10 h +1 h +4 h +2951 m +1 h +1 h +10 h +1 h +135 h +11 h +64 h +10 h +10 h +4 h +8152 m +601 h +4 h +520 h +57 h +1 h +266 h +1 h +82 h +4 h +8153 m +1975 m +8154 m +1953 m +10 h +65 h +8155 m +124 h +794 m +8156 m +570 h +1261 h +578 m +4 h +4 h +10 h +4 h +8157 m +1822 h +8158 m +10 h +1 h +1 h +4 h +986 h +1642 h +4 h +1 h +1 h +8159 m +1 h +8160 m +10 h +8161 m +82 h +4 h +1 h +83 h +114 h +10 h +119 h +4 h +8162 m +10 h +1 h +11 h +601 h +144 h +1 h +83 h +1 h +8163 m +8164 m +3509 m +4 h +10 h +230 h +73 h +1 h +10 h +1016 h +10 h +4810 m +45 h +1 h +1016 h +8165 m +8166 m +10 h +11 h +1 h +10 h +5621 m +262 h +74 h +1766 h +1 h +4 h +4 h +1 h +4 h +8167 m +140 h +4 h +10 h +41 h +1 h +146 h +4 h +1 h +8168 m +4 h +4 h +4 h +4574 m +57 h +4 h +4 h +8169 m +150 m +10 h +3909 m +1445 h +4 h +10 h +10 h +10 h +1 h +8170 m +3111 m +55 h +36 h +4 h +4 h +13 h +11 h +8171 m +1 h +57 h +4 h +4 h +4 h +4 h +10 h +8172 m +4 h +1 h +1 h +358 h +1 h +83 h +1 h +82 h +4 h +8173 m +10 h +4 h +4 h +59 h +8174 m +4 h +8175 m +4 h +10 h +124 h +4 h +4 h +3 h +1 h +307 h +4 h +1 h +82 h +139 h +8176 m +10 h +10 h +4 h +4 h +4 h +4 h +4 h +8177 m +1 h +4 h +1 h +1 h +8178 m +11 h +1 h +10 h +4 h +195 h +8179 m +4 h +8180 m +601 h +8181 m +1 h +8182 m +964 m +124 h +869 m +1 h +8183 m +8184 m +4 h +8185 m +1359 h +8186 m +10 h +266 h +1 h +1822 h +8187 m +1 h +1 h +25 h +9 m +8188 m +1 h +8189 m +11 h +10 h +8190 m +8191 m +8192 m +124 h +3 h +8193 m +1 h +8194 m +83 h +1 h +13 h +25 h +1 h +8195 m +10 h +57 h +82 h +8196 m +10 h +59 h +4 h +4 h +4 h +4 h +8197 m +1 h +1 h +8198 m +143 h +8199 m +124 h +1 h +10 h +1 h +601 h +8200 m +512 m +8201 m +229 h +8202 m +1 h +3 h +10 h +12 h +8203 m +4 h +170 h +12 h +124 h +1 h +10 h +4 h +8204 m +1 h +11 h +4 h +25 h +4 
h +1070 m +104 h +4 h +4 h +172 h +8205 m +3 h +8206 m +1 h +1 h +262 h +10 h +10 h +11 h +8207 m +8208 m +1 h +1556 m +1 h +1 h +4 h +1 h +4 h +36 h +5917 h +869 m +1 h +48 h +1 h +1685 h +13 h +124 h +4 h +8209 m +1650 h +1 h +8210 m +10 h +10 h +278 h +4 h +4 h +1 h +8211 m +8212 m +8213 m +31 h +297 h +4 h +4 h +1362 h +1 h +1 h +1606 m +190 h +3 h +31 h +57 h +10 h +10 h +8214 m +1 h +10 h +172 h +278 h +8215 m +1016 h +1 h +8216 m +8217 m +1 h +8218 m +5929 m +4 h +79 h +1 h +10 h +1 h +41 h +124 h +10 h +4 h +1 h +278 h +4 h +536 h +4 h +1691 m +110 h +8219 m +8220 m +138 h +10 h +74 h +8221 m +124 h +8222 m +4 h +8223 m +4 h +114 h +1 h +1 h +10 h +4986 m +4 h +4 h +4 h +1250 h +4 h +8224 m +1 h +307 h +1 h +1 h +1 h +8225 m +8226 m +170 h +1504 m +4 h +3 h +1 h +1105 h +4 h +4 h +8227 m +11 h +196 h +10 h +4 h +57 h +4 h +443 h +2746 m +4 h +911 m +4 h +10 h +125 h +45 h +41 h +4 h +11 h +109 h +4 h +1 h +4 h +5 h +8228 m +10 h +2379 m +1796 h +8229 m +1 h +8230 m +90 m +108 h +8231 m +12 h +8232 m +4 h +8233 m +1 h +1478 m +4 h +10 h +4 h +74 h +143 h +4 h +94 h +11 h +3 h +10 h +10 h +8234 m +8235 m +4 h +4 h +13 h +13 h +8236 m +4000 m +1 h +1 h +45 h +2272 m +8237 m +8238 m +4 h +4 h +1 h +8239 m +8240 m +8241 m +1 h +1714 m +10 h +1 h +8242 m +4 h +1 h +8243 m +10 h +4 h +1 h +399 h +8244 m +10 h +11 h +8245 m +10 h +4 h +125 h +4 h +10 h +4 h +109 h +57 h +3 h +4 h +4 h +4 h +10 h +1 h +8246 m +143 h +4 h +1 h +12 h +31 h +4 h +10 h +4 h +55 h +1 h +8017 m +4 h +8247 m +8248 m +10 h +8249 m +41 h +8250 m +4 h +8251 m +181 h +10 h +8252 m +31 h +84 h +8253 m +4 h +8254 m +4 h +8255 m +4 h +1 h +3 h +12 h +1 h +1 h +4 h +4 h +41 h +8256 m +4 h +4 h +4 h +4 h +4 h +4 h +112 h +8257 m +1 h +332 h +6558 m +10 h +270 h +11 h +4 h +4 h +4 h +8258 m +8259 m +109 h +266 h +353 m +4 h +4 h +1 h +278 h +1 h +1 h +1 h +10 h +1 h +8260 m +112 h +1137 h +59 h +8261 m +1 h +10 h +113 h +1 h +2887 h +8262 m +4 h +10 h +4 h +8263 m +8264 m +8265 m +125 h +4 h +4 h +4 
h +569 h +10 h +10 h +1470 h +8266 m +10 h +3 h +10 h +74 h +1 h +1 h +11 h +10 h +10 h +92 h +4 h +8267 m +11 h +10 h +124 h +125 h +104 h +976 h +57 h +258 h +114 h +230 h +4 h +8268 m +41 h +8269 m +167 h +464 h +10 h +8270 m +190 h +4 h +4 h +8271 m +10 h +25 h +10 h +1 h +10 h +358 h +1 h +8272 m +4 h +1 h +55 h +4 h +157 h +8273 m +966 m +8274 m +59 h +1 h +4 h +8275 m +157 h +1 h +4 h +31 h +4 h +1 h +79 h +1 h +4 h +4 h +4 h +69 h +10 h +11 h +4 h +57 h +8276 m +97 h +124 h +3025 m +125 h +1 h +12 h +4 h +1 h +109 h +718 h +82 h +41 h +4 h +4 h +196 h +4 h +64 h +1714 m +45 h +4 h +4 h +4 h +114 h +1 h +196 h +8277 m +4 h +8278 m +8279 m +1 h +10 h +92 h +64 h +11 h +8280 m +1 h +45 h +1 h +109 h +4 h +8281 m +1 h +8282 m +1 h +2625 h +3675 m +1 h +368 h +1685 h +119 h +8283 m +164 h +1 h +4 h +1 h +8284 m +91 h +1 h +1 h +10 h +4 h +64 h +8285 m +8286 m +8287 m +3 h +8 h +8288 m +8289 m +1 h +4 h +4 h +82 h +11 h +10 h +104 h +4 h +359 h +12 h +274 h +10 h +8290 m +272 h +185 h +8291 m +8292 m +3 h +8293 m +913 m +10 h +57 h +4 h +8294 m +64 h +4 h +8295 m +10 h +11 h +1 h +8296 m +8297 m +8298 m +8299 m +1 h +8300 m +3 h +10 h +4 h +10 h +4 h +4 h +8301 m +1 h +1 h +8302 m +92 h +4 h +124 h +11 h +10 h +124 h +1 h +3 h +1 h +147 h +8303 m +4 h +4 h +8304 m +4 h +1 h +8305 m +10 h +1 h +4 h +4 h +1 h +1 h +125 h +10 h +105 m +4 h +4 h +8306 m +4 h +1 h +4 h +158 h +25 h +8307 m +1 h +1 h +10 h +8308 m +4218 m +4 h +1 h +8309 m +1 h +81 m +4 h +250 h +1 h +172 h +55 h +1 h +129 h +1 h +41 h +1 h +4 h +1 h +146 h +4 h +8310 m +4 h +4 h +59 h +135 h +10 h +4 h +4 h +8311 m +7064 m +332 h +4 h +4 h +8312 m +4 h +8313 m +4 h +10 h +167 h +8314 m +10 h +1 h +82 h +55 h +25 h +10 h +1 h +1 h +10 h +1 h +10 h +569 h +1 h +10 h +4 h +8315 m +195 h +779 m +109 h +912 m +779 h +1 h +238 h +25 h +10 h +10 h +4 h +8316 m +274 h +1772 h +4 h +41 h +10 h +195 h +8317 m +8318 m +3 h +1189 m +8319 m +10 h +31 h +1 h +59 h +8320 m +4 h +10 h +238 h +8321 m +1027 h +1 h +10 
h +1 h +10 h +4 h +25 h +10 h +10 h +1 h +1 h +757 h +10 h +1 h +536 h +4 h +4 h +4 h +8322 m +8323 m +1 h +1 h +4 h +1 h +190 h +36 h +8324 m +10 h +10 h +4 h +186 h +8325 m +4 h +1 h +4 h +8326 m +170 h +1 h +1 h +332 h +1 h +10 h +8327 m +4 h +4 h +10 h +8328 m +8329 m +1 h +278 h +109 h +4292 m +4 h +1 h +8330 m +10 h +8331 m +8332 m +250 h +4 h +8333 m +10 h +1 h +265 h +1769 m +1 h +4 h +4 h +1 h +10 h +1 h +11 h +4 h +1 h +4 h +83 h +10 h +10 h +10 h +1 h +139 h +10 h +3646 m +4 h +55 h +3025 m +1 h +1 h +1 h +1 h +10 h +1 h +1 h +1 h +8334 m +8335 m +125 h +4 h +8336 m +73 h +1 h +330 h +8337 m +463 m +3 h +104 h +97 h +4 h +1 h +4 h +4 h +11 h +307 h +4 h +1 h +278 h +1 h +1955 m +57 h +45 h +8338 m +1 h +59 h +190 h +1 h +8339 m +4 h +1454 m +10 h +65 h +4 h +8340 m +169 h +1 h +258 h +1 h +4 h +5 h +1 h +4240 m +10 h +10 h +92 h +7870 m +10 h +8341 m +1 h +8342 m +4 h +1 h +4 h +94 h +10 h +73 h +583 m +1 h +1 h +1 h +8206 m +170 h +4 h +692 h +4 h +4 h +1 h +4 h +4 h +1968 m +8343 m +4 h +1 h +4 h +289 h +4 h +10 h +8344 m +2172 h +4 h +55 h +4 h +4 h +1975 m +1 h +146 h +8345 m +13 h +4 h +57 h +4 h +82 h +8346 m +1691 m +10 h +10 h +195 h +11 h +1 h +4 h +31 h +4 h +13 h +10 h +8347 m +10 h +1 h +8348 m +195 h +8349 m +10 h +8350 m +1 h +4 h +1038 m +8351 m +8352 m +1619 h +1 h +59 h +1 h +28 h +885 m +10 h +1 h +147 h +143 h +4 h +10 h +10 h +192 h +368 h +192 h +8353 m +1 h +1 h +4 h +4 h +4 h +10 h +8354 m +4 h +258 h +11 h +83 h +8355 m +1 h +10 h +538 h +10 h +10 h +4 h +4 h +27 h +8 h +11 h +10 h +143 h +139 h +1 h +4 h +8356 m +1122 m +8357 m +8358 m +4 h +114 h +8359 m +170 h +8360 m +10 h +8361 m +4 h +569 h +41 h +4 h +8362 m +10 h +990 m +11 h +4 h +4333 m +8363 m +4 h +1 h +4 h +31 h +4 h +1 h +4 h +4 h +45 h +1542 m +996 m +8364 m +4 h +1 h +4 h +1 h +10 h +1 h +403 h +10 h +146 h +114 h +10 h +1 h +4 h +13 h +8365 m +1 h +114 h +4 h +1 h +4 h +4 h +10 h +8366 m +3177 m +8367 m +124 h +4 h +4 h +4 h +8368 m +8369 m +4858 m +1 h +8370 m 
+143 h +8371 m +10 h +4 h +4 h +1 h +4 h +4 h +10 h +1914 m +1 h +164 h +82 h +10 h +1 h +8372 m +10 h +1 h +10 h +4 h +8373 m +1 h +109 h +109 h +4 h +1 h +56 h +1 h +4 h +8374 m +8375 m +10 h +3555 m +994 m +8376 m +4 h +1 h +1 h +10 h +1939 m +8377 m +64 h +8378 m +170 h +4 h +11 h +8379 m +10 h +4 h +94 h +4 h +109 h +8380 m +8381 m +8382 m +297 h +1 h +3 h +41 h +4 h +4 h +241 m +258 h +10 h +1 h +4 h +434 m +10 h +4 h +10 h +10 h +692 h +10 h +109 h +8383 m +1 h +8384 m +1 h +733 m +10 h +276 h +1 h +687 h +857 h +8385 m +4 h +443 h +10 h +56 h +4 h +1024 m +4 h +10 h +1 h +1 h +1 h +3768 m +4 h +4 h +4 h +1 h +124 h +1449 m +4 h +59 h +1 h +1 h +4 h +4 h +190 h +8386 m +1 h +4 h +1 h +1 h +147 h +10 h +4 h +4 h +59 h +4 h +10 h +1 h +172 h +4 h +82 h +11 h +27 h +1 h +258 h +8387 m +8388 m +8389 m +1 h +10 h +8390 m +83 h +1 h +3 h +74 h +10 h +10 h +8391 m +4 h +1 h +4 h +4 h +1 h +1 h +4 h +2172 h +808 m +1 h +477 h +8392 m +674 m +13 h +1 h +8393 m +1 h +1 h +1 h +4 h +10 h +10 h +8394 m +4 h +4 h +1835 m +1070 m +1 h +56 h +8395 m +65 h +4 h +36 h +135 h +4 h +4 h +8396 m +4 h +4 h +10 h +4 h +3 h +4 h +4 h +1 h +1 h +2617 m +1 h +31 h +4 h +4 h +1030 h +13 h +181 h +8397 m +7709 m +1 h +3 h +10 h +8398 m +83 h +1 h +4 h +4 h +8399 m +8400 m +1 h +3 h +1337 m +10 h +10 h +10 h +4 h +11 h +1 h +92 h +2920 m +1 h +4 h +31 h +59 h +1 h +8401 m +4 h +10 h +83 h +8402 m +4 h +41 h +1 h +59 h +1 h +5917 h +41 h +4 h +4 h +125 h +73 h +4 h +8403 m +10 h +104 h +4 h +10 h +4 h +74 h +1 h +4 h +358 h +4 h +45 h +41 h +4 h +181 h +83 h +265 h +10 h +8404 m +10 h +195 h +11 h +1 h +4 h +8405 m +4 h +78 m +8406 m +8407 m +1 h +8408 m +4 h +10 h +8409 m +83 h +464 h +4 h +8410 m +4 h +10 h +8411 m +1 h +5929 h +4 h +11 h +186 h +82 h +119 h +1796 h +195 h +8412 m +265 h +4 h +13 h +1016 h +8413 m +536 h +2733 h +4 h +8414 m +1 h +4 h +10 h +8415 m +1 h +1 h +4 h +4 h +195 h +8206 h +8416 m +4 h +65 h +1 h +8417 m +125 h +8418 m +266 h +4 h +8419 m +10 h +265 h +1 h 
+4 h +1 h +11 h +4 h +4 h +1 h +10 h +8420 m +82 h +8421 m +1 h +8422 m +10 h +83 h +1 h +8423 m +4 h +8424 m +8425 m +1 h +4 h +1 h +10 h +10 h +10 h +94 h +8426 m +8427 m +8428 m +4 h +976 h +4 h +4256 m +1 h +8429 m +1 h +167 h +1 h +4 h +8430 m +196 h +8431 m +1 h +578 m +1 h +10 h +1 h +1 h +8432 m +4 h +8433 m +718 h +4 h +912 m +4 h +1 h +229 h +8434 m +10 h +1 h +4 h +8435 m +11 h +82 h +1886 m +167 h +10 h +8436 m +8437 m +10 h +4 h +73 h +4 h +4 h +8438 m +12 h +109 h +8439 m +73 h +4 h +4 h +11 h +25 h +2592 m +10 h +1939 m +2172 h +1 h +266 h +59 h +8440 m +4 h +104 h +4 h +4 h +10 h +1 h +195 h +109 h +8441 m +4 h +4 h +1 h +1 h +1 h +6851 m +8442 m +8188 m +4 h +4 h +444 m +65 h +11 h +1 h +74 h +10 h +8443 m +8444 m +8445 m +8446 m +8447 m +1 h +536 h +4 h +10 h +1796 h +4 h +3 h +3558 m +4 h +1571 m +11 h +10 h +8448 m +1 h +8449 m +4 h +97 h +1 h +172 h +4 h +7214 m +3170 m +59 h +4 h +10 h +4 h +10 h +10 h +11 h +10 h +10 h +10 h +1 h +82 h +79 h +1 h +1508 m +1 h +4 h +11 h +4 h +601 h +493 m +10 h +8450 m +10 h +13 h +65 h +8451 m +8452 m +1 h +10 h +8453 m +1 h +10 h +10 h +4 h +400 m +1 h +31 h +468 m +4 h +8454 m +250 h +8455 m +4 h +4 h +2494 m +1 h +4 h +59 h +1822 h +25 h +8456 m +1 h +11 h +1 h +4 h +4 h +371 h +8457 m +3847 m +124 h +447 h +1 h +4 h +10 h +4824 m +278 h +4 h +10 h +31 h +8458 m +4 h +1309 m +8459 m +347 h +157 h +57 h +10 h +10 h +1 h +8460 m +8461 m +8462 m +10 h +8463 m +4 h +8464 m +1 h +185 h +109 h +147 h +4 h +5863 m +10 h +65 h +25 h +4 h +1 h +8465 m +4 h +10 h +1981 m +59 h +2041 m +4 h +10 h +25 h +10 h +11 h +8466 m +4 h +1016 h +8467 m +10 h +1 h +4 h +8468 m +1 h +8469 m +1 h +4 h +25 h +1 h +10 h +57 h +2148 m +8470 m +8471 m +82 h +8472 m +8473 m +1 h +4 h +5436 m +8474 m +4 h +1 h +4 h +28 h +8475 m +4 h +8476 m +11 h +10 h +538 h +8477 m +10 h +4 h +8478 m +1 h +4496 m +4 h +4 h +170 h +1 h +10 h +1 h +1 h +55 h +8479 m +55 h +316 m +8480 m +10 h +2022 m +386 h +4 h +8481 m +41 h +65 h +196 h +4 h +74 h 
+25 h +454 m +4 h +1 h +4 h +4 h +4 h +4 h +10 h +10 h +1 h +10 h +83 h +31 h +4 h +8482 m +5060 m +10 h +12 h +8483 m +3 h +238 h +8484 m +109 h +110 h +1 h +1 h +10 h +4 h +4 h +57 h +4 h +10 h +8485 m +186 h +8486 m +1 h +146 h +1 h +8487 m +4 h +45 h +8488 m +169 h +8489 m +10 h +479 m +1 h +10 h +1 h +1 h +463 m +4 h +399 h +10 h +4 h +1 h +1 h +114 h +4 h +4 h +4 h +1 h +11 h +4 h +59 h +3161 m +4132 m +4 h +4 h +190 h +83 h +8490 m +64 h +4 h +204 h +1 h +10 h +8491 m +10 h +1 h +79 h +8492 m +8493 m +8494 m +1074 h +1 h +1117 m +113 h +11 h +4 h +10 h +4 h +964 m +124 h +4 h +22 h +10 h +11 h +1370 m +1 h +8495 m +10 h +10 h +8496 m +45 h +8497 m +10 h +1017 m +1 h +8498 m +8499 m +1 h +4 h +10 h +4 h +4 h +8500 m +125 h +31 h +8501 m +1 h +279 h +8502 m +8503 m +4 h +4 h +435 m +8504 m +1 h +8505 m +1 h +1 h +22 h +1 h +1056 m +4 h +4 h +4 h +1470 h +4 h +55 h +1 h +8506 m +8507 m +1 h +8508 m +1642 h +4 h +10 h +3 h +10 h +4 h +434 m +8509 m +1 h +536 h +75 m +10 h +8510 m +1250 h +4 h +114 h +65 h +4 h +2128 m +65 h +10 h +1 h +8511 m +4 h +195 h +157 h +8512 m +92 h +1 h +10 h +4 h +31 h +4 h +8513 m +146 h +1016 h +8514 m +4 h +4 h +4 h +8515 m +10 h +82 h +1 h +75 m +196 h +1 h +4 h +4 h +4 h +46 h +8516 m +12 h +307 h +185 h +1 h +307 h +4 h +10 h +8517 m +109 h +1 h +4 h +10 h +8518 m +82 h +8519 m +1 h +8520 m +4 h +4111 m +4 h +4 h +278 h +358 h +1 h +10 h +10 h +4 h +1 h +172 h +10 h +8521 m +4 h +8522 m +4 h +103 m +1 h +8523 m +4 h +25 h +8524 m +8525 m +1 h +41 h +10 h +7870 m +10 h +4 h +687 h +358 h +276 h +4 h +8526 m +4 h +10 h +4 h +1 h +8527 m +10 h +10 h +8 h +124 h +109 h +8528 m +174 h +114 h +533 m +1 h +10 h +4 h +1 h +1 h +1 h +57 h +1 h +4 h +1 h +10 h +8529 m +8530 m +8531 m +1 h +11 h +4 h +57 h +4 h +4 h +57 h +4 h +1 h +11 h +4 h +399 h +11 h +64 h +124 h +506 m +1 h +1 h +8532 m +4 h +4 h +6133 m +8533 m +8534 m +2625 h +11 h +1 h +8535 m +1137 h +4 h +10 h +11 h +1 h +1 h +10 h +8536 m +5035 m +8537 m +1619 h +158 h +3 h +10 
h +1 h +4 h +41 h +8538 m +11 h +1 h +1 h +4 h +172 h +1 h +10 h +11 h +119 h +8539 m +11 h +8540 m +3112 m +4 h +4 h +59 h +8541 m +11 h +10 h +22 h +8542 m +83 h +1642 h +8543 m +8544 m +4 h +4 h +184 h +2184 m +1 h +5757 m +1045 m +4 h +3303 m +4 h +1070 m +4 h +8545 m +4 h +1 h +10 h +1 h +8546 m +4 h +22 h +590 m +1 h +8547 m +10 h +1 h +1685 h +8548 m +10 h +10 h +447 h +806 m +4 h +83 h +1 h +8549 m +190 h +8550 m +10 h +8551 m +10 h +4 h +8552 m +4 h +57 h +1374 m +1 h +278 h +447 h +1 h +138 h +1083 h +3 h +82 h +11 h +1 h +1 h +8553 m +4 h +4 h +4 h +7839 h +1 h +10 h +8554 m +2532 m +4464 m +4651 m +83 h +12 h +10 h +59 h +4 h +10 h +1 h +4 h +4 h +59 h +1 h +8555 m +4 h +1 h +119 h +4 h +8556 m +1 h +83 h +1 h +104 h +4 h +10 h +4 h +4 h +124 h +8557 m +94 h +8558 m +4 h +4 h +1 h +8559 m +10 h +11 h +59 h +8560 m +1 h +10 h +10 h +10 h +10 h +8561 m +3 h +1261 h +55 h +65 h +8562 m +4 h +4 h +6399 m +11 h +12 h +4 h +4 h +4 h +8563 m +10 h +11 h +10 h +10 h +1 h +10 h +185 h +8564 m +8565 m +1 h +1 h +8566 m +10 h +4 h +3 h +143 h +139 h +8567 m +4 h +190 h +4 h +860 m +4 h +1 h +1939 h +8568 m +4 h +4 h +4 h +10 h +2308 m +4 h +4 h +4 h +1 h +4 h +27 h +125 h +2266 m +4 h +4 h +4 h +4 h +10 h +4 h +8569 m +4 h +10 h +8570 m +1 h +64 h +36 h +11 h +4 h +8571 m +1 h +73 h +8572 m +1 h +4 h +4 h +8573 m +4 h +4 h +1 h +170 h +118 m +10 h +4 h +3750 m +8574 m +8575 m +692 h +11 h +10 h +258 h +359 h +10 h +1 h +41 h +1 h +11 h +10 h +1105 h +8576 m +230 h +169 h +4 h +1 h +31 h +8577 m +10 h +8578 m +10 h +4 h +65 h +146 h +1 h +172 h +4 h +45 h +4 h +4 h +4 h +8579 m +4 h +8580 m +82 h +1 h +10 h +10 h +4 h +1 h +10 h +857 h +1 h +195 h +10 h +1 h +25 h +4 h +1 h +911 m +167 h +4 h +1 h +10 h +262 h +8581 m +8582 m +1 h +74 h +4 h +8583 m +79 h +41 h +8584 m +10 h +169 h +124 h +8585 m +2625 h +4 h +4 h +4 h +4 h +8586 m +10 h +10 h +4 h +4 h +109 h +11 h +4 h +10 h +1 h +8587 m +64 h +104 h +4 h +869 h +4 h +238 h +10 h +8588 m +110 h +1 h +4127 m +1508 
m +4 h +1 h +97 h +83 h +4 h +4 h +8589 m +4 h +143 h +1 h +8590 m +8591 m +1 h +1 h +1 h +4 h +83 h +109 h +1975 m +4 h +1 h +8592 m +114 h +1772 h +10 h +4 h +8593 m +1 h +31 h +10 h +4 h +4 h +92 h +10 h +332 h +4 h +10 h +143 h +4 h +10 h +65 h +3600 m +8594 m +10 h +1 h +1 h +65 h +10 h +687 h +17 h +11 h +8595 m +4 h +4 h +10 h +4 h +8596 m +4 h +114 h +4 h +10 h +73 h +1 h +4896 m +4 h +4 h +10 h +468 m +10 h +10 h +13 h +8597 m +1 h +59 h +1 h +4 h +125 h +1 h +229 h +1 h +4 h +8598 m +92 h +2923 m +57 h +8599 m +1 h +8600 m +4 h +97 h +6400 m +82 h +10 h +4 h +4 h +8601 m +4 h +3634 m +1 h +4 h +59 h +8602 m +1 h +4 h +10 h +649 m +3 h +2588 m +1 h +10 h +4 h +4 h +11 h +8603 m +185 h +4 h +4 h +123 h +181 h +4 h +1 h +65 h +69 h +4 h +41 h +10 h +4 h +4 h +1 h +8604 m +3 h +55 h +57 h +4 h +4 h +4 h +196 h +1 h +8605 m +10 h +8606 m +4 h +8607 m +10 h +4 h +3 h +4 h +1020 m +10 h +4 h +10 h +1 h +626 m +506 m +1 h +10 h +10 h +146 h +3555 m +640 h +4 h +125 h +10 h +1 h +124 h +820 m +1 h +1 h +4 h +533 m +55 h +1 h +82 h +8608 m +1 h +8609 m +11 h +4 h +110 h +3293 m +157 h +8610 m +1 h +8611 m +7 m +8612 m +1 h +10 h +57 h +1 h +687 h +109 h +4 h +4 h +92 h +124 h +1 h +8613 m +124 h +4 h +8614 m +4 h +28 h +4 h +4 h +10 h +447 h +1 h +4 h +1 h +1 h +11 h +4 h +996 m +8615 m +65 h +1 h +10 h +125 h +279 h +8616 m +10 h +578 m +10 h +10 h +7938 m +4714 m +11 h +1 h +84 h +8617 m +1 h +185 h +11 h +4 h +4 h +4 h +11 h +124 h +3 h +8618 m +10 h +4283 m +4 h +8619 m +8620 m +8621 m +8622 m +8623 m +41 h +2438 m +1 h +10 h +4 h +10 h +4 h +10 h +10 h +10 h +8624 m +8625 m +1 h +11 h +8626 m +10 h +11 h +1 h +10 h +1 h +11 h +8627 m +359 h +4 h +73 h +4 h +4 h +4 h +1 h +36 h +82 h +4 h +4 h +8628 m +45 h +4349 m +10 h +8629 m +4 h +1 h +11 h +4 h +13 h +1 h +4 h +8630 m +1 h +10 h +25 h +8631 m +1 h +1 h +11 h +8 h +8632 m +5411 m +10 h +4 h +4 h +77 h +55 h +4 h +195 h +8633 m +4 h +4 h +8634 m +8635 m +97 h +1 h +109 h +4 h +4 h +55 h +1 h +4 h +1 h +250 h 
+4 h +22 h +1 h +83 h +4 h +13 h +1 h +1835 m +4 h +1 h +935 h +8636 m +8637 m +4 h +285 m +10 h +4 h +1 h +119 h +4 h +8638 m +8639 m +8640 m +538 h +31 h +8641 m +4 h +332 h +4 h +8642 m +4 h +4 h +4 h +11 h +779 h +2148 m +4 h +1 h +22 h +8643 m +10 h +59 h +1 h +1 h +36 h +13 h +258 h +31 h +4 h +10 h +1 h +1 h +1 h +1 h +279 h +1 h +8644 m +147 h +1 h +110 h +10 h +4 h +10 h +195 h +8645 m +73 h +3622 m +8646 m +4 h +10 h +4 h +8647 m +8648 m +1 h +4 h +4 h +135 h +31 h +8147 m +8649 m +1 h +8650 m +1 h +8651 m +55 h +8652 m +8653 m +4 h +5059 m +4 h +4 h +4 h +4 h +8654 m +4 h +1 h +57 h +8655 m +172 h +1 h +146 h +1 h +1 h +8656 m +464 h +8657 m +10 h +4 h +8658 m +1 h +10 h +1 h +135 h +8659 m +1 h +4 h +83 h +4 h +1 h +15 m +10 h +146 h +8660 m +1 h +1 h +8661 m +8662 m +169 h +289 h +57 h +3303 m +1 h +556 h +8663 m +8664 m +8 h +1 h +10 h +4 h +10 h +8665 m +8666 m +6565 m +358 h +4 h +278 h +4 h +1 h +10 h +1 h +4 h +1 h +10 h +8667 m +10 h +4 h +22 h +8668 m +31 h +124 h +4 h +3 h +1214 m +1 h +4 h +158 h +10 h +8669 m +1309 m +109 h +1 h +4 h +1 h +8670 m +1 h +692 h +41 h +10 h +8671 m +443 h +8672 m +4 h +1 h +57 h +8673 m +258 h +4 h +1261 h +1 h +8674 m +31 h +10 h +1 h +4 h +1 h +299 h +4 h +13 h +8675 m +56 h +358 h +4 h +1 h +4 h +1 h +164 h +1 h +97 h +123 h +124 h +4 h +8676 m +4 h +125 h +1 h +1 h +4 h +4 h +8677 m +125 h +33 m +10 h +313 m +4 h +119 h +31 h +4 h +4 h +10 h +687 h +10 h +36 h +4 h +10 h +8678 m +25 h +25 h +8679 m +3 h +104 h +8680 m +687 h +447 h +181 h +8681 m +8682 m +4 h +4 h +1 h +11 h +79 h +10 h +1 h +4 h +45 h +1 h +8683 m +8684 m +1 h +8685 m +8686 m +8687 m +8688 m +10 h +386 h +4 h +4 h +118 h +4 h +3 h +4 h +8689 m +8690 m +4 h +4 h +4 h +2851 m +10 h +1 h +1 h +4 h +4 h +10 h +10 h +196 h +1 h +4 h +1 h +1 h +920 p +11 h +4 h +1083 h +4 h +4 h +4 h +57 h +8691 m +11 h +8692 m +8693 m +59 h +4 h +8694 m +8695 m +10 h +4 h +4 h +10 h +4 h +692 h +4 h +4 h +8696 m +124 h +8697 m +4 h +10 h +4 h +5567 m +8698 m +10 
h +1 h +10 h +104 h +4 h +8699 m +4 h +4 h +10 h +1 h +109 h +10 h +124 h +8700 m +41 h +1 h +114 h +1 h +11 h +1710 m +4 h +73 h +1 h +4 h +143 h +4 h +4 h +4 h +8701 m +4 h +536 h +1410 m +2815 m +935 h +8702 m +1 h +4 h +8703 m +520 h +1 h +8704 m +8705 m +10 h +8706 m +10 h +8707 m +1835 m +109 h +536 h +1 h +10 h +8708 m +4 h +8709 m +4 h +266 h +8710 m +3679 m +1 h +295 h +1 h +4 h +8711 m +276 h +1 h +1 h +1 h +8712 m +8713 m +4 h +10 h +10 h +8714 m +8715 m +10 h +10 h +1 h +4 h +8716 m +4 h +55 h +1 h +8717 m +10 h +4 h +8718 m +10 h +8 h +1 h +4 h +8719 m +4 h +295 h +8720 m +4 h +155 m +22 h +82 h +8721 m +1 h +8722 m +74 h +265 h +195 h +4 h +10 h +4 h +25 h +1 h +1299 m +57 h +4 h +119 h +4 h +4 h +4 h +4 h +8723 m +10 h +4 h +10 h +8724 m +74 h +27 h +10 h +8725 m +1177 m +8726 m +4 h +4 h +1 h +10 h +601 h +1 h +1 h +10 h +4 h +112 h +4 h +36 h +3837 m +11 h +278 h +11 h +1 h +10 h +1 h +33 m +8727 m +195 h +8728 m +10 h +4 h +75 h +45 h +10 h +1 h +4 h +8729 m +112 h +10 h +11 h +4 h +4 h +59 h +10 h +1 h +8730 m +1 h +4 h +5613 m +4 h +1796 h +278 h +1 h +4 h +770 m +4 h +4 h +630 m +8731 m +1 h +10 h +8732 m +170 h +4 h +10 h +4 h +92 h +8733 m +4 h +169 h +41 h +4 h +8734 m +4 h +1 h +3278 m +359 h +64 h +4 h +4 h +4 h +8735 m +4 h +238 h +4 h +4 h +139 h +8736 m +4 h +4 h +4 h +41 h +1 h +3 h +10 h +109 h +4 h +4 h +8737 m +8738 m +10 h +56 h +8739 m +238 h +1 h +8740 m +10 h +83 h +8741 m +4 h +1 h +4 h +55 h +10 h +1 h +4 h +4 h +1 h +4 h +10 h +10 h +92 h +10 h +5125 m +8742 m +10 h +1 h +4 h +11 h +10 h +10 h +8743 m +190 h +8744 m +4 h +11 h +10 h +4 h +1 h +172 h +10 h +4 h +1 h +31 h +10 h +488 h +25 h +8745 m +31 h +8746 m +1 h +4 h +8747 m +1 h +10 h +59 h +8748 m +4 h +4 h +8749 m +10 h +4 h +169 h +4 h +10 h +8750 m +1 h +10 h +8751 m +31 h +31 h +97 h +1 h +1 h +8752 m +4 h +8753 m +757 h +4 h +1 h +1 h +10 h +59 h +4 h +1 h +1 h +10 h +8754 m +1914 m +1 h +65 h +108 h +139 h +8755 m +4 h +1 h +339 m +92 h +779 h +8756 m +4 h +1214 m 
+92 h +8757 m +4 h +11 h +229 h +10 h +4 h +1 h +8758 m +8759 m +4 h +124 h +274 h +5963 m +10 h +4 h +12 h +4 h +1 h +10 h +1 h +5281 m +10 h +3799 m +10 h +167 h +8760 m +692 h +8761 m +4 h +8762 m +10 h +4 h +65 h +8763 m +3 h +10 h +2379 h +27 h +8764 m +1 h +4 h +4 h +195 h +383 h +8765 m +8766 m +82 h +4 h +4 h +10 h +8767 m +8768 m +4 h +146 h +1 h +6438 m +8769 m +94 h +4 h +718 h +4 h +1 h +2280 m +10 h +8770 m +1 h +74 h +692 h +83 h +4 h +59 h +56 h +7755 m +135 h +2794 m +8771 m +10 h +123 h +4 h +10 h +119 h +8772 m +3036 m +10 h +1 h +10 h +8773 m +1 h +25 h +11 h +4 h +1 h +976 h +443 h +626 m +8774 m +31 h +4 h +338 m +4 h +10 h +28 h +10 h +10 h +4 h +1790 h +986 h +4 h +10 h +11 h +4 h +8775 m +8776 m +8777 m +211 m +1 h +114 h +1 h +1 h +8778 m +1 h +8779 m +4 h +146 h +8780 m +4 h +332 h +25 h +8781 m +8782 m +4 h +27 h +1 h +186 h +601 h +65 h +6869 m +5053 m +82 h +8783 m +4 h +601 h +1642 h +4 h +10 h +238 h +56 h +11 h +1 h +10 h +4 h +8784 m +195 h +4 h +359 h +8785 m +4 h +4 h +10 h +332 h +1 h +10 h +1092 m +8786 m +1 h +8787 m +8788 m +10 h +106 h +1 h +4 h +4 h +2139 m +59 h +4 h +4 h +1 h +1 h +4 h +1 h +4 h +10 h +92 h +4 h +8789 m +8790 m +10 h +1 h +368 h +8791 m +109 h +204 h +842 m +8792 m +4 h +64 h +538 h +1 h +6200 m +4218 m +8793 m +8794 m +8795 m +114 h +1 h +1685 h +10 h +4 h +8796 m +578 m +4 h +4 h +8797 m +1406 h +57 h +10 h +25 h +4 h +1 h +1 h +4 h +55 h +1 h +8798 m +1 h +4 h +1454 m +8799 m +41 h +468 m +8800 m +8801 m +4 h +1 h +10 h +8802 m +8803 m +172 h +10 h +986 h +996 m +5379 m +10 h +146 h +8804 m +1 h +65 h +57 h +1284 m +3742 m +1 h +146 h +1 h +3 h +4 h +10 h +8805 m +8806 m +13 h +1 h +10 h +83 h +4 h +10 h +4 h +1 h +4 h +10 h +4 h +8807 m +8808 m +8809 m +1 h +11 h +8810 m +4 h +104 h +124 h +3 h +4 h +262 h +1 h +4 h +11 h +4 h +1 h +4 h +10 h +1 h +1 h +8811 m +124 h +1 h +1 h +1939 h +1 h +8812 m +8813 m +8814 m +1304 m +8815 m +82 h +59 h +8816 m +332 h +1 h +1 h +10 h +1 h +4 h +10 h +4 h +1 h +1 h 
+8817 m +4 h +1 h +3321 m +109 h +82 h +4 h +1 h +55 h +538 h +1 h +1 h +1 h +157 h +10 h +10 h +4 h +10 h +10 h +4 h +1 h +358 h +1 h +4 h +1478 m +1 h +156 h +1 h +1 h +3 h +2266 m +8818 m +135 h +8819 m +976 h +8820 m +64 h +59 h +570 h +10 h +109 h +56 h +25 h +1 h +8821 m +1 h +289 h +10 h +8822 m +4810 m +4 h +36 h +10 h +10 h +10 h +313 m +1 h +74 h +10 h +4 h +3 h +10 h +601 h +8823 m +1 h +10 h +4 h +1 h +4 h +1 h +2923 m +1 h +5907 m +620 m +4 h +677 m +8824 m +4 h +1 h +8825 m +10 h +10 h +25 h +10 h +40 h +1 h +1822 h +1250 h +8826 m +1 h +1 h +4 h +1 h +1 h +124 h +10 h +8827 m +109 h +4 h +1105 h +295 h +1 h +4 h +82 h +74 h +4 h +4 h +10 h +10 h +185 h +125 h +11 h +83 h +65 h +195 h +8828 m +10 h +56 h +1 h +4 h +11 h +10 h +1374 m +1 h +64 h +4 h +1 h +1 h +1 h +8829 m +1 h +5526 m +4 h +10 h +82 h +4 h +1 h +196 h +1 h +8830 m +3161 m +541 m +11 h +4203 m +4 h +8831 m +838 m +1 h +10 h +48 h +4 h +1 h +8832 m +10 h +139 h +4 h +4 h +4 h +8833 m +4 h +164 h +1725 m +8834 m +8835 m +2484 m +10 h +8836 m +10 h +4 h +4 h +1 h +11 h +8837 m +8838 m +1 h +8839 m +8840 m +3360 m +59 h +8841 m +82 h +8842 m +8843 m +167 h +8844 m +8 h +31 h +10 h +55 h +10 h +11 h +41 h +278 h +1737 m +3499 m +74 h +10 h +4 h +10 h +1309 m +31 h +224 h +8845 m +4 h +1 h +4 h +4 h +74 h +4 h +82 h +578 m +10 h +74 h +8846 m +4 h +8847 m +4 h +10 h +238 h +4 h +83 h +1 h +158 h +1 h +36 h +4 h +8848 m +4 h +6817 m +8849 m +83 h +195 h +1 h +196 h +25 h +10 h +3036 m +1 h +4 h +8850 m +4 h +4 h +10 h +582 m +1 h +986 h +110 h +12 h +190 h +1 h +435 m +8851 m +447 h +1045 m +258 h +135 h +583 m +4 h +1 h +45 h +4 h +124 h +181 h +4 h +8852 m +4 h +1 h +1 h +158 h +55 h +1 h +8853 m +114 h +327 m +112 h +1 h +8854 m +4 h +1 h +45 h +238 h +1 h +468 m +1 h +8855 m +8856 m +110 h +8857 m +1 h +4 h +1 h +119 h +8858 m +8859 m +1619 h +1 h +11 h +8860 m +10 h +8861 m +4 h +73 h +1685 h +36 h +8862 m +238 h +8863 m +4 h +4 h +4 h +4 h +40 h +91 h +114 h +4 h +61 m +10 h +10 h +10 h 
+1 h +8864 m +4 h +8865 m +4 h +10 h +1 h +1620 m +250 h +8866 m +4 h +8867 m +368 h +4 h +204 h +4 h +124 h +4 h +8868 m +10 h +8869 m +109 h +10 h +4 h +139 h +4 h +8870 m +8871 m +23 m +1 h +8872 m +1 h +1 h +4 h +170 h +8873 m +56 h +31 h +10 h +8874 m +2025 m +83 h +55 h +144 h +124 h +4 h +1685 h +27 h +4 h +8875 m +64 h +140 h +4 h +8876 m +1 h +1 h +1 h +8877 m +4 h +123 h +1 h +8878 m +1 h +8879 m +1 h +860 m +4 h +10 h +4 h +8880 m +8881 m +10 h +4 h +10 h +3 h +1 h +118 h +1 h +124 h +1 h +4 h +1 h +4 h +10 h +4 h +4 h +1 h +8882 m +307 h +4 h +8883 m +10 h +3 h +10 h +82 h +4 h +5254 m +1 h +1478 m +229 h +8884 m +124 h +8 h +4 h +4 h +4 h +860 m +1 h +10 h +8885 m +92 h +64 h +4 h +1 h +57 h +4 h +23 m +4350 m +41 h +4 h +8886 m +8887 m +4 h +8888 m +4 h +10 h +4 h +4 h +1 h +4 h +57 h +1 h +93 m +4 h +10 h +195 h +4 h +8889 m +4 h +22 h +11 h +41 h +626 m +8890 m +8891 m +8892 m +10 h +143 h +10 h +1 h +41 h +1 h +1 h +1 h +124 h +1 h +10 h +11 h +172 h +10 h +1 h +1 h +4 h +8893 m +10 h +1 h +266 h +4 h +4 h +10 h +1 h +125 h +8894 m +11 h +1619 h +109 h +12 h +4 h +65 h +10 h +61 m +97 h +74 h +147 h +8895 m +4 h +10 h +1 h +4 h +139 h +5537 m +4 h +1 h +10 h +11 h +4 h +1751 m +1 h +276 h +8896 m +297 h +3 h +125 h +1 h +8897 m +10 h +7760 m +11 h +83 h +74 h +164 h +230 h +3 h +4 h +8898 m +41 h +10 h +4 h +27 h +1 h +83 h +4 h +1685 h +65 h +4 h +4 h +4 h +185 h +4 h +10 h +8899 m +11 h +1 h +4 h +911 m +4 h +109 h +8900 m +278 h +8901 m +11 h +8902 m +8903 m +10 h +1 h +8904 m +295 h +4 h +4 h +8905 m +4132 m +1 h +1 h +307 h +4 h +8906 m +8907 m +170 h +4 h +8908 m +770 m +1 h +4 h +104 h +276 h +575 m +41 h +8909 m +10 h +8910 m +125 h +4 h +4 h +8911 m +10 h +353 m +11 h +1 h +10 h +7064 m +73 h +359 h +143 h +4 h +8912 m +1337 m +1 h +83 h +10 h +10 h +10 h +4 h +10 h +10 h +4 h +1 h +6135 m +1 h +79 h +1 h +4 h +8913 m +1250 h +4 h +10 h +10 h +256 m +4 h +36 h +64 h +2984 m +4 h +4 h +1 h +921 m +8914 m +4 h +1 h +4 h +316 m +10 h +4 h +4 
h +4 h +123 h +8915 m +4 h +41 h +319 m +10 h +976 h +8793 m +4 h +125 h +1 h +640 h +10 h +10 h +8916 m +5060 m +8917 m +4 h +8918 m +10 h +8919 m +4 h +10 h +8920 m +1 h +4 h +4 h +57 h +11 h +124 h +8653 m +10 h +1 h +2131 m +1 h +10 h +10 h +8161 m +4 h +69 h +4 h +4 h +10 h +412 m +1 h +1 h +1 h +8921 m +8922 m +1 h +8923 m +59 h +258 h +1 h +57 h +1 h +124 h +1 h +186 h +77 h +4 h +3 h +4 h +10 h +4 h +12 h +13 h +10 h +5801 m +4 h +8924 m +4 h +4 h +4 h +1 h +10 h +173 h +386 h +10 h +56 h +4 h +888 m +10 h +167 h +5923 m +10 h +1 h +31 h +10 h +1 h +8925 m +8926 m +3 h +10 h +82 h +1 h +8927 m +2923 m +10 h +1 h +1 h +4 h +25 h +57 h +4 h +4 h +10 h +1 h +4 h +11 h +1 h +8928 m +4 h +2520 m +8929 m +8930 m +59 h +4 h +8931 m +8932 m +1 h +10 h +109 h +10 h +1 h +229 h +56 h +8933 m +4 h +1 h +11 h +1 h +4 h +125 h +10 h +4 h +10 h +8934 m +1772 h +4 h +4 h +4 h +10 h +4 h +1 h +196 h +1 h +147 h +10 h +1 h +10 h +1 h +10 h +4 h +1 h +3100 m +8935 m +1 h +146 h +8936 m +8937 m +5 h +8938 m +8939 m +4 h +1 h +4 h +4 h +698 m +4 h +4 h +10 h +1 h +1 h +295 h +4 h +146 h +123 h +11 h +103 m +8940 m +82 h +4 h +8941 m +536 h +4 h +8942 m +8943 m +8944 m +266 h +1 h +10 h +8945 m +10 h +8946 m +4301 m +73 h +3025 m +31 h +1 h +276 h +146 h +319 h +10 h +59 h +1 h +4 h +1 h +10 h +10 h +1 h +4 h +8947 m +4 h +4 h +8948 m +4 h +10 h +8949 m +8950 m +156 h +25 h +4 h +1201 m +8951 m +10 h +31 h +10 h +1137 h +8952 m +8953 m +684 m +238 h +4 h +22 h +11 h +4 h +10 h +8954 m +8955 m +4 h +10 h +4 h +4 h +327 m +1 h +8956 m +40 h +4 h +2494 m +92 h +8957 m +1620 m +190 h +265 h +11 h +8958 m +124 h +10 h +8959 m +4 h +83 h +169 h +8960 m +4 h +733 m +25 h +10 h +1 h +1 h +224 h +94 h +1 h +8961 m +4 h +1 h +4469 m +109 h +1 h +359 h +8962 m +8963 m +59 h +8964 m +108 h +6399 m +5965 m +986 h +10 h +31 h +1 h +4 h +278 h +11 h +69 h +8965 m +1470 h +10 h +869 h +10 h +8966 m +64 h +10 h +8967 m +10 h +4 h +172 h +4 h +4 h +6592 m +10 h +3396 m +8968 m +7839 h +8969 m 
+57 h +10 h +4 h +23 h +10 h +583 m +1409 m +181 h +8970 m +319 h +8971 m +8972 m +1 h +4 h +59 h +4 h +8973 m +1 h +1 h +10 h +10 h +4 h +1 h +55 h +4 h +1 h +8974 m +8975 m +1 h +4 h +8976 m +10 h +57 h +167 h +10 h +55 h +10 h +4 h +265 h +13 h +10 h +1 h +1 h +1638 m +8977 m +4 h +109 h +8978 m +10 h +4 h +1370 m +10 h +4 h +8979 m +10 h +1 h +8980 m +1 h +3 h +4 h +8981 m +10 h +10 h +488 h +1 h +4 h +4 h +4 h +1 h +10 h +4 h +10 h +4 h +687 h +1 h +8982 m +10 h +124 h +1685 h +8983 m +4 h +10 h +1 h +8984 m +1 h +112 h +8985 m +8986 m +143 h +267 m +10 h +74 h +1 h +8324 m +10 h +1 h +11 h +1 h +10 h +56 h +55 h +82 h +4 h +83 h +27 h +1 h +8987 m +1138 m +125 h +140 h +8988 m +1 h +10 h +104 h +8989 m +73 h +4 h +1 h +1271 m +1 h +10 h +4 h +10 h +4 h +1 h +1620 h +4 h +4 h +1 h +319 h +1 h +103 m +1 h +10 h +11 h +10 h +464 h +11 h +8990 m +109 h +59 h +195 h +10 h +57 h +124 h +4 h +1 h +10 h +4 h +2733 h +6066 m +8991 m +57 h +488 h +57 h +10 h +185 h +8992 m +164 h +8993 m +518 m +4 h +860 h +1 h +4 h +64 h +1 h +129 h +443 h +3 h +1 h +4 h +4 h +8994 m +10 h +10 h +8995 m +1714 m +4 h +1 h +8996 m +79 h +1 h +464 h +4 h +59 h +4 h +4 h +4 h +1 h +5 h +10 h +10 h +57 h +8997 m +10 h +4 h +8998 m +10 h +986 h +245 m +1 h +10 h +8999 m +4 h +185 h +8716 m +1 h +104 h +119 h +9000 m +1 h +196 h +10 h +9001 m +82 h +9002 m +1 h +172 h +125 h +146 h +10 h +278 h +4 h +10 h +9003 m +4 h +9004 m +10 h +65 h +9005 m +9006 m +9007 m +10 h +4 h +1 h +156 h +104 h +4 h +25 h +1 h +1 h +45 h +1 h +79 h +767 m +10 h +1016 h +1 h +3 h +11 h +9008 m +9009 m +520 h +9010 m +9011 m +11 h +10 h +9012 m +9013 m +1 h +92 h +169 h +4 h +4 h +9014 m +9015 m +4 h +10 h +4 h +4 h +1 h +4 h +1 h +59 h +9016 m +4 h +204 h +4 h +123 h +9017 m +1 h +74 h +41 h +4 h +4 h +4 h +1822 h +1 h +1 h +10 h +9018 m +4 h +4 h +9019 m +533 h +4 h +185 h +129 h +4 h +195 h +196 h +167 h +4 h +10 h +9020 m +10 h +4 h +4 h +1 h +1 h +4 h +4381 m +1 h +9021 m +377 m +4 h +22 h +332 h +10 h +692 h 
+10 h +9022 m +425 m +9023 m +1 h +399 h +9024 m +1 h +4 h +12 h +1 h +146 h +1 h +4 h +4 h +9025 m +1 h +79 h +4151 m +463 m +4 h +1 h +1127 m +9026 m +4 h +4 h +2480 m +9027 m +1 h +192 h +10 h +10 h +4 h +9028 m +10 h +6112 m +10 h +3 h +4 h +9029 m +4 h +9030 m +9031 m +92 h +1 h +1 h +10 h +10 h +10 h +1 h +9032 m +9033 m +9034 m +285 m +1 h +11 h +4 h +1056 m +124 h +22 h +1249 m +4 h +4 h +1 h +4 h +4 h +10 h +124 h +9035 m +4 h +1 h +1 h +147 h +1 h +1 h +9036 m +976 h +55 h +10 h +4 h +4 h +1548 m +79 h +3 h +4 h +1 h +1 h +108 h +9037 m +10 h +1 h +1 h +9038 m +4 h +4 h +4 h +10 h +97 h +1 h +9039 m +4 h +1 h +9040 m +9041 m +403 h +9042 m +4240 m +1 h +57 h +4 h +10 h +265 h +169 h +9043 m +10 h +463 m +9044 m +1 h +1137 h +1 h +1218 m +9045 m +9046 m +2931 m +10 h +124 h +9047 m +9048 m +3177 m +9049 m +10 h +9050 m +11 h +1 h +4 h +4 h +1 h +74 h +1 h +9051 m +258 h +9052 m +4 h +368 h +10 h +9053 m +9054 m +4 h +129 h +1 h +4 h +4 h +1 h +353 m +3 h +1 h +10 h +4 h +9055 m +4 h +10 h +3533 m +9056 m +3943 m +9057 m +9058 m +1884 m +10 h +10 h +443 h +10 h +9059 m +1 h +1 h +1 h +1691 m +9060 m +4 h +4 h +10 h +104 h +9061 m +1 h +9062 m +10 h +11 h +332 h +9063 m +82 h +3913 m +4 h +10 h +1 h +5 h +1 h +9064 m +1 h +266 h +9065 m +1835 h +897 m +9066 m +139 h +9067 m +9068 m +204 h +172 h +74 h +1 h +9069 m +4 h +9070 m +57 h +1 h +9071 m +9072 m +11 h +11 h +4 h +4 h +3 h +156 h +10 h +9073 m +9074 m +12 h +250 h +9075 m +2706 m +10 h +74 h +4 h +4 h +1 h +41 h +10 h +10 h +1105 h +11 h +10 h +4 h +190 h +1 h +9076 m +48 h +103 h +4 h +25 h +74 h +262 h +9077 m +9078 m +4 h +195 h +1 h +9079 m +4 h +9080 m +4 h +9081 m +10 h +158 h +4 h +10 h +403 h +1 h +4 h +601 h +1 h +9082 m +10 h +1 h +359 h +1 h +11 h +192 h +1 h +1 h +2374 m +9083 m +9084 m +1 h +7585 m +569 h +9085 m +1 h +3 h +2617 m +4 h +109 h +146 h +9086 m +59 h +1619 h +9087 m +4 h +10 h +4 h +164 h +1 h +4 h +9088 m +9089 m +79 h +1 h +8477 m +4 h +65 h +10 h +4 h +9090 m +119 h +4 h 
+25 h +9091 m +976 h +1 h +578 h +11 h +82 h +10 h +4 h +4 h +59 h +1 h +1 h +9092 m +4 h +4 h +10 h +119 h +124 h +97 h +4 h +9093 m +4 h +9094 m +4 h +4 h +65 h +9095 m +10 h +9096 m +9097 m +123 h +143 h +3558 h +9098 m +10 h +4 h +41 h +82 h +9099 m +4 h +1 h +4 h +119 h +2054 m +1737 m +40 h +9100 m +1 h +9101 m +1 h +124 h +4 h +1 h +11 h +1 h +1 h +4 h +1 h +656 h +1308 m +1 h +1 h +258 h +9102 m +167 h +869 h +109 h +9103 m +1957 m +2281 m +57 h +9104 m +8 h +109 h +6549 m +10 h +11 h +4 h +10 h +195 h +4 h +1261 h +1685 h +9105 m +4 h +82 h +3558 h +9106 m +169 h +4 h +4 h +6941 m +10 h +1 h +4 h +1 h +4 h +9107 m +27 h +195 h +10 h +146 h +11 h +4 h +4 h +4 h +9108 m +10 h +9109 m +1 h +4 h +4 h +9110 m +10 h +10 h +4 h +82 h +11 h +9111 m +10 h +4 h +9112 m +4 h +4 h +9113 m +9114 m +332 h +119 h +10 h +4 h +10 h +9115 m +9116 m +4 h +31 h +1 h +10 h +4 h +1 h +1 h +278 h +4 h +4 h +9117 m +1 h +1 h +9118 m +110 h +4 h +319 h +9119 m +1 h +40 h +1 h +368 h +109 h +278 h +278 h +4 h +4 h +4 h +169 h +823 m +82 h +9120 m +4 h +1 h +10 h +25 h +9121 m +112 h +1 h +10 h +1835 h +4 h +146 h +4 h +295 h +10 h +9122 m +11 h +4 h +77 h +9123 m +9124 m +74 h +203 m +1409 m +27 h +27 h +5053 m +4 h +9125 m +4 h +9126 m +4 h +109 h +9127 m +10 h +9128 m +2205 m +4 h +10 h +258 h +25 h +110 h +9129 m +9130 m +4 h +1796 h +4 h +4 h +478 m +1074 h +359 h +12 h +10 h +1 h +10 h +9131 m +1541 m +9132 m +11 h +9133 m +4 h +976 h +4 h +307 h +10 h +11 h +1 h +297 h +9134 m +10 h +11 h +9135 m +1 h +9136 m +4 h +13 h +4 h +9137 m +10 h +109 h +10 h +1 h +4 h +82 h +10 h +1 h +4 h +9138 m +1 h +4 h +4 h +9139 m +1 h +4 h +118 h +1 h +147 h +4 h +3995 m +9140 m +9141 m +9142 m +4 h +9143 m +109 h +4 h +273 m +11 h +9144 m +9145 m +1 h +4 h +10 h +1766 h +9146 m +1 h +9147 m +114 h +4 h +4 h +170 h +1 h +4 h +9148 m +4 h +25 h +9149 m +10 h +114 h +9150 m +45 h +4 h +9151 m +9152 m +31 h +10 h +164 h +75 h +10 h +1 h +718 h +4 h +9153 m +1 h +9154 m +1 h +9155 m +10 h +538 h 
+10 h +1 h +9156 m +10 h +1 h +4 h +1685 h +83 h +4 h +9157 m +74 h +10 h +1 h +73 h +143 h +1 h +9158 m +10 h +4 h +1 h +9159 m +1 h +10 h +4 h +57 h +113 h +4 h +41 h +56 h +9160 m +13 h +10 h +4 h +4 h +1 h +195 h +265 h +2002 m +12 h +74 h +10 h +4 h +4 h +10 h +266 h +4 h +4 h +1 h +230 h +4 h +9161 m +1 h +1 h +4 h +9162 m +9163 m +230 h +11 h +8638 m +10 h +4 h +4 h +10 h +9164 m +4 h +9165 m +843 m +9166 m +1 h +4 h +10 h +4 h +9167 m +687 h +1 h +4 h +1 h +11 h +9168 m +10 h +110 h +28 h +31 h +82 h +9169 m +147 h +9170 m +10 h +1016 h +11 h +1 h +9171 m +1 h +4 h +9172 m +2824 m +1 h +143 h +9173 m +1 h +4 h +4 h +10 h +8 h +9174 m +10 h +57 h +94 h +1822 h +9175 m +4 h +4 h +4 h +4 h +4 h +9176 m +1 h +4 h +9177 m +258 h +224 h +4 h +9178 m +1 h +9179 m +258 h +4 h +4 h +307 h +123 h +9180 m +124 h +123 h +1 h +9181 m +4 h +186 h +4 h +64 h +1303 m +1 h +59 h +1 h +106 h +10 h +383 h +4 h +4 h +9182 m +10 h +4 h +9183 m +9184 m +65 h +4538 m +1 h +4 h +1884 m +3 h +9185 m +3555 m +4 h +31 h +1 h +4 h +11 h +9186 m +74 h +139 h +5976 m +9187 m +185 h +4 h +9188 m +9189 m +83 h +4 h +9190 m +4 h +9191 m +31 h +258 h +59 h +9192 m +4 h +1 h +1 h +1 h +57 h +1 h +10 h +103 h +31 h +9193 m +1 h +578 h +3 h +172 h +4 h +4 h +82 h +4 h +9194 m +82 h +4 h +9195 m +10 h +11 h +9196 m +4 h +1 h +9197 m +10 h +173 h +4 h +124 h +2815 m +4 h +4966 m +10 h +4 h +10 h +282 m +104 h +9198 m +1 h +11 h +4 h +9199 m +1 h +56 h +4 h +1 h +238 h +4 h +9200 m +9201 m +1 h +5230 m +9202 m +1 h +1 h +1 h +1 h +4 h +4 h +7394 m +4 h +4 h +4 h +1 h +4 h +477 m +620 m +36 h +1 h +9203 m +403 h +468 h +41 h +278 h +1 h +241 m +135 h +4 h +801 m +10 h +2984 m +692 h +1 h +10 h +3 h +9204 m +59 h +10 h +10 h +10 h +9205 m +4 h +124 h +9206 m +9207 m +3 h +1 h +4 h +9208 m +4 h +1 h +9209 m +167 h +31 h +3 h +12 h +4 h +258 h +109 h +4 h +9210 m +146 h +4 h +10 h +1 h +9211 m +10 h +10 h +9212 m +4 h +9213 m +9214 m +1 h +798 m +118 h +4 h +31 h +109 h +10 h +4 h +10 h +1 h +4 h +1 
h +9215 m +13 h +9216 m +1 h +1027 m +4 h +289 h +9217 m +119 h +10 h +192 h +10 h +1 h +4 h +1 h +6869 m +73 h +4 h +25 h +57 h +4 h +9218 m +11 h +1 h +9219 m +1 h +10 h +1 h +195 h +536 h +4 h +10 h +94 h +1 h +190 h +55 h +11 h +9220 m +4 h +464 h +74 h +1 h +9221 m +4 h +10 h +224 h +4 h +9222 m +59 h +1 h +4 h +10 h +4 h +1 h +9223 m +4 h +109 h +8221 m +4 h +9224 m +48 h +3 h +1 h +9225 m +9226 m +10 h +4 h +4 h +1772 h +129 h +4 h +123 h +3707 m +10 h +10 h +92 h +4 h +1 h +79 h +1 h +9227 m +108 h +1 h +10 h +4 h +1759 m +359 h +4 h +10 h +9228 m +4 h +4 h +976 h +1 h +174 h +9229 m +9230 m +4 h +4 h +4 h +1 h +56 h +9231 m +10 h +1 h +4 h +9232 m +9233 m +9234 m +4 h +9235 m +4 h +4 h +9236 m +3562 m +31 h +69 h +1 h +9237 m +4 h +9238 m +5141 m +4 h +4 h +10 h +4 h +9239 m +1 h +9240 m +9241 m +4 h +9242 m +65 h +73 h +167 h +570 h +3 h +4 h +9243 m +129 h +4 h +5523 m +9244 m +4 h +79 h +4 h +170 h +77 h +9245 m +10 h +1 h +1 h +10 h +25 h +4 h +11 h +9246 m +10 h +4 h +170 h +9247 m +1 h +626 h +25 h +1 h +4 h +1 h +124 h +9248 m +110 h +41 h +1619 h +9249 m +9250 m +4 h +124 h +4 h +3396 m +65 h +1 h +10 h +9251 m +614 m +4 h +56 h +144 h +9252 m +9253 m +1 h +4 h +10 h +1 h +1 h +3799 m +307 h +57 h +9254 m +1 h +1 h +1 h +173 h +9255 m +4 h +4 h +1772 h +109 h +1 h +1 h +82 h +4 h +109 h +10 h +124 h +114 h +9256 m +4 h +4 h +10 h +4 h +9257 m +4 h +4 h +228 m +1 h +12 h +9258 m +4 h +4 h +9259 m +9260 m +82 h +1375 m +319 h +4 h +4 h +59 h +9261 m +278 h +10 h +4 h +109 h +371 h +9262 m +1 h +112 h +114 h +9263 m +1 h +4 h +104 h +9264 m +10 h +1 h +4 h +9265 m +83 h +8179 m +4 h +1 h +1 h +129 h +4 h +4 h +1 h +9266 m +1955 m +4 h +9267 m +9268 m +10 h +4 h +10 h +170 h +9269 m +4 h +1 h +1 h +4 h +10 h +45 h +4 h +4 h +4 h +266 h +1 h +4 h +124 h +1 h +27 h +1 h +10 h +83 h +1 h +4 h +158 h +8 h +1 h +10 h +9270 m +3 h +4 h +1 h +10 h +1 h +48 h +9271 m +10 h +970 m +1 h +25 h +332 h +692 h +1 h +536 h +250 h +83 h +158 h +4 h +11 h +1 h +10 h +4 
h +9272 m +381 m +11 h +1 h +109 h +9273 m +1 h +1 h +642 m +9274 m +11 h +1 h +9275 m +4 h +4 h +1 h +4 h +1 h +4 h +3 h +9276 m +1 h +27 h +1 h +4 h +1 h +9277 m +9278 m +1016 h +4 h +9279 m +104 h +10 h +4 h +74 h +1 h +10 h +9280 m +10 h +1 h +1 h +297 h +1 h +4 h +10 h +9281 m +4 h +1337 m +10 h +82 h +4 h +1 h +1 h +9282 m +1 h +9283 m +5809 m +10 h +4 h +10 h +779 h +276 h +371 h +935 h +9284 m +9285 m +46 h +1 h +4 h +10 h +1 h +10 h +4 h +9286 m +1 h +31 h +4 h +1766 h +146 h +313 h +190 h +9287 m +4 h +10 h +10 h +109 h +10 h +135 h +1 h +195 h +94 h +1 h +31 h +4 h +10 h +4 h +4 h +184 h +1 h +123 h +4 h +463 h +9288 m +10 h +4 h +1 h +935 h +4 h +28 h +83 h +1 h +4 h +4 h +4 h +9289 m +4 h +9290 m +123 h +82 h +1 h +1 h +1 h +10 h +4 h +11 h +4 h +59 h +9291 m +11 h +10 h +11 h +9292 m +4 h +125 h +10 h +4 h +146 h +9293 m +10 h +31 h +97 h +41 h +41 h +1 h +1 h +25 h +25 h +1 h +4 h +9294 m +1 h +4 h +11 h +55 h +3 h +1 h +9295 m +4 h +1 h +158 h +229 h +11 h +4 h +4 h +10 h +4 h +4 h +4 h +114 h +9296 m +10 h +9297 m +10 h +59 h +278 h +9298 m +1 h +4 h +64 h +4 h +9299 m +114 h +229 h +278 h +1 h +9300 m +169 h +140 h +10 h +9301 m +4 h +1 h +1556 m +1 h +1 h +238 h +1 h +3 h +25 h +9302 m +10 h +10 h +1 h +1 h +9303 m +1 h +10 h +10 h +4 h +4 h +1 h +10 h +1 h +1191 m +10 h +79 h +4 h +92 h +41 h +27 h +3533 m +9304 m +114 h +13 h +4 h +1 h +2625 h +41 h +368 h +3 h +9305 m +10 h +1 h +9306 m +1 h +73 h +4 h +124 h +9307 m +69 h +10 h +4 h +4 h +1 h +83 h +4 h +4 h +10 h +11 h +307 h +358 h +536 h +464 h +31 h +976 h +1030 h +1 h +10 h +4 h +83 h +4 h +123 h +1 h +10 h +59 h +4 h +1 h +1 h +31 h +4 h +190 h +10 h +3 h +1 h +25 h +4 h +9308 m +224 h +1 h +9309 m +1 h +1 h +10 h +332 h +9310 m +4 h +4 h +10 h +1 h +4 h +1886 m +65 h +9311 m +106 h +124 h +195 h +10 h +4 h +569 h +10 h +4 h +1 h +1 h +2592 m +966 m +3 h +4 h +124 h +386 h +13 h +9312 m +9313 m +41 h +1 h +1 h +1 h +4 h +113 h +113 h +57 h +139 h +124 h +9314 m +69 h +1089 m +9315 m 
+9316 m +8 h +2184 m +4 h +9317 m +10 h +9318 m +11 h +1 h +1 h +256 m +13 h +10 h +11 h +9319 m +8 h +11 h +9320 m +307 h +4 h +55 h +139 h +1 h +1 h +4 h +2522 m +135 h +4 h +885 m +4 h +41 h +10 h +10 h +10 h +4 h +11 h +9321 m +9322 m +9323 m +4 h +9293 m +9324 m +9325 m +157 h +443 h +4 h +1 h +11 h +1 h +45 h +10 h +10 h +443 h +55 h +146 h +4 h +1 h +185 h +4 h +4 h +1642 h +4718 m +1 h +4 h +4 h +10 h +10 h +4 h +9326 m +1 h +97 h +10 h +258 h +377 m +10 h +73 h +1 h +125 h +125 h +74 h +11 h +9327 m +10 h +9328 m +4 h +9329 m +489 m +423 m +83 h +25 h +45 h +83 h +55 h +9330 m +94 h +10 h +9331 m +1 h +9332 m +4 h +9333 m +4 h +1822 h +238 h +157 h +10 h +10 h +4 h +10 h +123 h +9334 m +10 h +1 h +10 h +10 h +1 h +4 h +124 h +4 h +4 h +11 h +1 h +4 h +4 h +4 h +4 h +1 h +10 h +56 h +11 h +123 h +10 h +25 h +109 h +2769 m +1 h +9335 m +9336 m +124 h +1 h +124 h +4 h +109 h +4 h +4 h +1 h +1 h +4 h +4 h +270 m +11 h +1 h +266 h +10 h +4 h +10 h +1 h +9337 m +73 h +9338 m +1 h +4 h +4 h +9339 m +119 h +4 h +1 h +4 h +9340 m +9341 m +2425 m +146 h +2720 m +10 h +4 h +9342 m +4 h +4 h +4 h +9343 m +9344 m +367 m +9345 m +1 h +9346 m +4 h +4 h +9027 m +196 h +4 h +9347 m +1 h +84 h +1650 h +10 h +104 h +9348 m +4 h +2625 h +4 h +4 h +4 h +4 h +4 h +2733 h +1 h +146 h +4 h +109 h +9349 m +9350 m +4 h +114 h +9351 m +8 h +123 h +10 h +4 h +9352 m +4 h +4 h +1 h +10 h +3742 m +1 h +4 h +4 h +4 h +4 h +4 h +4 h +1 h +129 h +1 h +1 h +9353 m +4 h +1 h +124 h +170 h +9354 m +167 h +1 h +10 h +11 h +10 h +28 h +65 h +9355 m +9356 m +4 h +1 h +83 h +1 h +4 h +10 h +109 h +10 h +1 h +1 h +4 h +1 h +91 h +9357 m +9358 m +9359 m +1205 m +10 h +9360 m +9361 m +1 h +4 h +9362 m +9363 m +82 h +3 h +112 h +1 h +9364 m +82 h +10 h +1 h +5557 m +1 h +4 h +9365 m +4 h +4 h +9366 m +1 h +299 h +9367 m +9368 m +9007 m +10 h +447 h +1 h +10 h +4 h +9369 m +2281 m +965 m +10 h +1 h +601 h +4 h +9370 m +9371 m +737 m +4 h +10 h +9372 m +9373 m +9374 m +4 h +692 h +1 h +1 h +4 h +3 h 
+4 h +4 h +1 h +10 h +10 h +1 h +9375 m +124 h +4 h +12 h +1 h +9376 m +1 h +12 h +1 h +1 h +9377 m +1 h +9378 m +9379 m +9380 m +9381 m +4 h +4 h +185 h +9382 m +114 h +912 m +4 h +135 h +41 h +45 h +190 h +1 h +10 h +4 h +4 h +10 h +4 h +10 h +9383 m +10 h +1 h +538 h +9384 m +114 h +270 m +4 h +59 h +4 h +4 h +3341 m +1 h +4 h +4 h +4 h +4 h +4 h +4 h +4 h +1 h +9385 m +9386 m +282 m +82 h +1 h +124 h +1 h +488 h +11 h +4 h +83 h +1 h +1 h +1847 m +172 h +9387 m +9388 m +83 h +1 h +9389 m +9 m +10 h +4 h +4 h +167 h +94 h +10 h +1 h +4 h +9390 m +4 h +170 h +4 h +1 h +10 h +2391 m +9391 m +10 h +10 h +109 h +4 h +1 h +83 h +7253 m +125 h +4 h +9392 m +167 h +1105 h +8002 m +9393 m +11 h +195 h +4 h +10 h +4 h +1 h +109 h +9394 m +4 h +124 h +10 h +4 h +996 m +1 h +10 h +143 h +9395 m +8 h +10 h +278 h +4 h +12 h +1 h +4441 m +4 h +9396 m +9397 m +1 h +9398 m +1 h +10 h +9399 m +104 h +10 h +687 h +125 h +939 m +83 h +4 h +190 h +3622 m +4367 m +9400 m +1 h +11 h +4 h +9401 m +4 h +9402 m +295 h +9403 m +464 h +1 h +10 h +74 h +1 h +4378 m +123 h +10 h +10 h +447 h +4 h +104 h +195 h +25 h +332 h +1 h +9404 m +84 h +10 h +59 h +4 h +1 h +10 h +97 h +59 h +10 h +4 h +9405 m +1 h +140 h +135 h +7792 m +59 h +9406 m +1 h +4 h +113 h +10 h +4 h +4 h +4 h +9407 m +9408 m +3 h +1 h +9409 m +976 h +9410 m +9411 m +4 h +10 h +278 h +4 h +1 h +4 h +1445 m +10 h +9412 m +9413 m +4 h +10 h +4 h +295 h +9414 m +104 h +590 h +266 h +1 h +1 h +9415 m +196 h +1 h +4 h +1 h +10 h +4 h +125 h +1 h +4 h +4 h +4 h +1 h +10 h +92 h +1 h +1 h +25 h +10 h +238 h +464 h +10 h +97 h +4 h +1 h +11 h +1 h +9416 m +9417 m +11 h +4 h +1 h +9 m +10 h +9418 m +4 h +11 h +10 h +4 h +65 h +4 h +4 h +82 h +1284 m +10 h +10 h +1 h +147 h +9419 m +9420 m +4 h +57 h +4 h +4 h +4 h +10 h +4 h +10 h +9421 m +9422 m +9423 m +506 h +10 h +11 h +10 h +9424 m +1 h +10 h +9425 m +10 h +1 h +1 h +9426 m +9427 m +9428 m +172 h +1 h +83 h +1 h +4 h +4 h +1 h +157 h +4 h +1 h +4 h +4 h +10 h +9429 m +1650 h 
+9430 m +9431 m +4 h +4 h +4 h +124 h +9432 m +9433 m +36 h +4 h +57 h +185 h +10 h +109 h +1 h +808 m +9434 m +1 h +124 h +9435 m +10 h +262 h +31 h +4 h +195 h +10 h +1 h +278 h +147 h +313 h +65 h +109 h +9436 m +109 h +4 h +4 h +1 h +1 h +976 h +36 h +1 h +6549 m +13 h +9437 m +1 h +10 h +4 h +10 h +1 h +4 h +4 h +9438 m +9439 m +9440 m +9441 m +1 h +1 h +10 h +9442 m +4 h +4320 m +1 h +41 h +9443 m +190 h +1 h +1 h +10 h +3 h +9444 m +12 h +9445 m +4 h +1 h +1 h +9446 m +46 h +9447 m +278 h +10 h +1 h +4 h +9448 m +1 h +1 h +368 h +9449 m +59 h +156 h +5 h +4 h +41 h +9450 m +9451 m +9452 m +9453 m +4 h +4 h +1 h +383 h +45 h +4 h +31 h +10 h +4 h +92 h +4 h +11 h +172 h +9454 m +10 h +9455 m +59 h +1 h +10 h +124 h +9456 m +196 h +8133 m +9457 m +1 h +12 h +41 h +4 h +195 h +757 h +9458 m +40 h +1 h +10 h +9459 m +989 m +1 h +1470 h +4 h +1 h +65 h +25 h +9460 m +31 h +1780 m +83 h +27 h +10 h +1 h +10 h +11 h +1 h +4 h +4 h +479 m +41 h +4 h +1 h +9461 m +10 h +195 h +8 h +9462 m +4 h +4 h +1261 h +1 h +4 h +4 h +10 h +10 h +9463 m +4 h +1535 m +4 h +10 h +1 h +9464 m +9465 m +4 h +4 h +4 h +4 h +1 h +4 h +1 h +9466 m +10 h +4 h +181 h +9467 m +1 h +4 h +9468 m +8 h +11 h +3 h +4 h +124 h +83 h +4 h +4 h +1 h +10 h +1 h +25 h +4 h +139 h +10 h +92 h +1 h +4 h +181 h +9469 m +9470 m +104 h +10 h +9471 m +57 h +9472 m +9473 m +36 h +258 h +10 h +4 h +9474 m +9475 m +1 h +9476 m +8 h +9477 m +1 h +13 h +10 h +9478 m +31 h +1 h +10 h +9479 m +4 h +9480 m +9481 m +83 h +9482 m +4 h +65 h +1 h +114 h +9483 m +57 h +1 h +1 h +4 h +1 h +9484 m +10 h +12 h +1 h +9485 m +4 h +1 h +4 h +83 h +83 h +10 h +238 h +10 h +1780 m +4 h +9486 m +82 h +9487 m +4 h +4 h +31 h +6869 m +1 h +3 h +195 h +279 h +9488 m +1 h +9489 m +569 h +9490 m +10 h +4 h +4 h +1 h +11 h +164 h +9491 m +10 h +59 h +1 h +124 h +11 h +4538 m +10 h +9492 m +1 h +4 h +359 h +124 h +238 h +9493 m +4 h +9494 m +9495 m +9496 m +91 h +82 h +9497 m +1 h +4 h +1 h +65 h +3 h +1 h +73 h +1 h +4 h +9498 m +4 
h +146 h +31 h +11 h +10 h +1261 h +124 h +41 h +9499 m +1 h +10 h +83 h +124 h +9500 m +112 h +8533 m +1 h +4 h +1 h +83 h +4 h +55 h +4 h +10 h +4 h +1 h +9501 m +112 h +4 h +1 h +9502 m +1828 m +82 h +2278 m +185 h +11 h +538 h +104 h +4 h +4 h +1 h +9503 m +1642 h +4 h +10 h +518 m +114 h +4 h +4 h +9504 m +4 h +1646 m +103 h +1 h +57 h +1 h +4 h +83 h +41 h +9505 m +4 h +9506 m +4 h +4 h +10 h +4 h +109 h +10 h +2124 m +41 h +4 h +170 h +1 h +36 h +10 h +125 h +1 h +10 h +9507 m +9508 m +1 h +12 h +1 h +276 h +4 h +10 h +74 h +73 h +9509 m +118 h +27 h +113 h +9510 m +4 h +36 h +4 h +25 h +4 h +1 h +266 h +1 h +5225 m +9511 m +4 h +1 h +4 h +9512 m +467 m +9513 m +12 h +2314 m +4 h +10 h +911 m +9514 m +10 h +79 h +68 m +10 h +11 h +2163 m +1 h +4 h +1 h +11 h +9515 m +10 h +1 h +276 h +9516 m +94 h +10 h +196 h +7924 m +83 h +1 h +10 h +278 h +9517 m +4 h +57 h +1 h +11 h +1 h +4 h +74 h +238 h +4 h +4 h +371 h +4 h +9518 m +9519 m +45 h +65 h +1 h +4 h +4 h +4 h +13 h +1 h +1 h +358 h +9520 m +9521 m +9522 m +10 h +9523 m +10 h +164 h +4 h +1 h +10 h +1 h +9524 m +10 h +11 h +10 h +464 h +1083 h +31 h +9525 m +9526 m +11 h +4 h +196 h +11 h +1 h +4 h +106 h +1 h +147 h +79 h +9527 m +1 h +1 h +82 h +10 h +4 h +2172 h +10 h +9528 m +74 h +1 h +9529 m +4 h +1 h +204 h +4 h +9530 m +7253 m +97 h +4 h +9531 m +4 h +9532 m +4 h +1 h +196 h +9533 m +4 h +185 h +1127 m +97 h +1 h +9534 m +4 h +4 h +1 h +1 h +4 h +250 h +4 h +9535 m +9536 m +1 h +10 h +8 h +25 h +11 h +9537 m +1 h +1116 m +97 h +9538 m +4 h +9539 m +11 h +4 h +9540 m +4 h +4 h +9541 m +31 h +10 h +9542 m +1309 m +82 h +1 h +9543 m +9544 m +114 h +8 h +9545 m +9546 m +140 h +170 h +10 h +10 h +4 h +4 h +1 h +4 h +7999 m +123 h +113 h +447 h +1478 h +9547 m +10 h +1 h +9548 m +31 h +1 h +4 h +27 h +4 h +36 h +4 h +4 h +1 h +4 h +1 h +4 h +9549 m +1 h +9550 m +9551 m +1650 h +1 h +25 h +4 h +10 h +10 h +10 h +195 h +169 h +4 h +9552 m +74 h +57 h +1 h +9553 m +25 h +297 h +1 h +5456 m +11 h +4127 m 
+196 h +4 h +9554 m +1 h +4 h +11 h +8555 m +4 h +1 h +1 h +1 h +9555 m +1 h +9556 m +4 h +8485 m +4 h +10 h +5 h +4 h +10 h +31 h +65 h +9557 m +9558 m +25 h +4 h +9559 m +10 h +9560 m +1 h +4 h +9561 m +10 h +4 h +4 h +9562 m +4 h +1 h +1 h +359 h +4 h +31 h +9563 m +1 h +4257 m +4 h +4 h +172 h +9564 m +3 h +164 h +976 h +4 h +10 h +10 h +9565 m +125 h +22 h +9566 m +1 h +4 h +91 h +10 h +3768 m +4 h +1627 m +578 h +10 h +4 h +1 h +1 h +9567 m +4 h +10 h +2794 m +31 h +41 h +1 h +1 h +1 h +11 h +4 h +124 h +10 h +4 h +3095 m +1 h +91 h +174 h +276 h +4 h +9568 m +4 h +4 h +82 h +56 h +25 h +4 h +10 h +25 h +1 h +10 h +14 m +1 h +12 h +11 h +9569 m +1 h +147 h +59 h +2719 m +1 h +1 h +25 h +4 h +190 h +9570 m +4 h +170 h +4 h +1886 m +9571 m +478 m +9572 m +9573 m +4 h +1027 m +135 h +1 h +1 h +4 h +9574 m +10 h +2558 m +94 h +11 h +83 h +9575 m +4 h +40 h +9576 m +10 h +119 h +9577 m +1 h +258 h +94 h +820 m +7322 m +4 h +4 h +4 h +146 h +83 h +11 h +173 h +55 h +9578 m +41 h +97 h +1 h +104 h +4 h +1 h +10 h +4 h +4 h +874 m +4 h +1 h +9579 m +1 h +1 h +332 h +170 h +10 h +1 h +1 h +74 h +4 h +1493 m +10 h +9580 m +83 h +4 h +265 h +1454 m +9581 m +10 h +25 h +1 h +31 h +4 h +172 h +10 h +9582 m +4 h +195 h +1 h +1261 h +976 h +238 h +82 h +9583 m +4 h +1 h +1 h +9584 m +4 h +3704 m +1 h +4 h +11 h +986 h +1 h +1 h +332 h +11 h +1 h +1 h +93 h +8781 m +10 h +172 h +10 h +4 h +11 h +1 h +4 h +9585 m +9586 m +359 h +4 h +4 h +195 h +135 h +4177 m +4 h +146 h +11 h +11 h +4 h +48 h +9587 m +4 h +601 h +1 h +82 h +10 h +25 h +119 h +9588 m +124 h +9589 m +1201 m +4 h +1 h +1 h +4 h +601 h +11 h +82 h +91 h +6381 m +4 h +4616 m +10 h +4 h +135 h +10 h +119 h +2952 m +172 h +820 m +36 h +9590 m +4 h +9591 m +278 h +9592 m +4 h +1 h +125 h +9593 m +211 m +65 h +1 h +56 h +9594 m +4 h +4 h +1 h +10 h +4 h +4 h +1 h +167 h +9595 m +1 h +9596 m +10 h +1 h +1 h +4 h +9597 m +9598 m +9599 m +164 h +10 h +170 h +4 h +4 h +59 h +9600 m +4 h +11 h +1453 m +3068 m +74 h +4 h 
+4 h +11 h +1 h +4 h +4 h +1 h +3 h +4 h +124 h +4 h +10 h +4 h +97 h +1 h +557 m +9601 m +4 h +4 h +10 h +192 h +4 h +10 h +3 h +9602 m +9603 m +158 h +1 h +9604 m +4 h +2592 m +9605 m +1 h +11 h +9606 m +9607 m +82 h +307 h +4 h +9608 m +11 h +1 h +10 h +135 h +4 h +9609 m +10 h +1 h +10 h +82 h +9610 m +3 h +332 h +156 h +238 h +9611 m +4 h +196 h +104 h +1 h +4 h +3680 m +4 h +4 h +9612 m +1074 h +10 h +83 h +1 h +4 h +238 h +9613 m +1893 m +1261 h +4648 m +1 h +4 h +4 h +10 h +10 h +4 h +770 m +4 h +4 h +4 h +4 h +1 h +22 h +25 h +1 h +4 h +888 m +1 h +10 h +9614 m +4 h +9615 m +1 h +4 h +82 h +4 h +4 h +9616 m +23 h +4 h +11 h +1 h +1 h +4 h +1 h +8133 m +10 h +9617 m +10 h +3 h +1 h +9618 m +4 h +1 h +9619 m +92 h +9620 m +124 h +41 h +31 h +10 h +2984 m +10 h +4 h +3161 m +31 h +1 h +73 h +4 h +9621 m +10 h +4 h +10 h +10 h +82 h +10 h +146 h +4 h +181 h +4 h +109 h +330 m +1 h +1 h +4 h +1 h +1 h +1 h +41 h +1 h +1 h +10 h +1 h +1220 m +9622 m +11 h +9623 m +4 h +9624 m +1 h +4 h +57 h +4 h +1 h +976 h +1 h +55 h +9625 m +4 h +11 h +9626 m +196 h +9627 m +1 h +10 h +31 h +4 h +1 h +4 h +4 h +278 h +9628 m +1127 m +9629 m +196 h +3 h +3 h +1 h +31 h +4 h +4 h +3 h +11 h +1 h +4 h +1 h +10 h +9630 m +1260 m +403 h +12 h +9631 m +10 h +73 h +4 h +4 h +1 h +2844 m +278 h +124 h +119 h +4 h +31 h +11 h +4 h +9632 m +1 h +125 h +1 h +4 h +167 h +1 h +10 h +9633 m +295 h +1 h +9475 m +4 h +1 h +57 h +4 h +10 h +1 h +4 h +4 h +10 h +9634 m +10 h +1 h +4 h +4 h +4 h +10 h +238 h +57 h +1 h +10 h +1 h +9635 m +9636 m +146 h +1 h +9637 m +9638 m +1 h +10 h +4 h +1 h +4 h +4 h +10 h +9639 m +1 h +9640 m +11 h +258 h +4 h +41 h +3933 m +4 h +297 h +1 h +911 m +9641 m +1 h +11 h +4 h +124 h +3 h +1 h +196 h +4 h +10 h +2379 h +41 h +1 h +575 m +97 h +1 h +1 h +1 h +10 h +10 h +10 h +1 h +1 h +10 h +9642 m +57 h +82 h +4 h +9643 m +1 h +9644 m +810 m +9645 m +83 h +10 h +5470 m +10 h +1884 h +1 h +9646 m +9647 m +82 h +1 h +9648 m +4 h +1074 h +73 h +125 h +1030 h +9649 
m +10 h +4 h +10 h +9650 m +3246 m +186 h +1 h +57 h +10 h +1884 h +1 h +1 h +687 h +9651 m +4 h +9652 m +10 h +147 h +4 h +4 h +1 h +97 h +1 h +4 h +5869 m +9653 m +1 h +9654 m +4 h +4 h +170 h +10 h +1 h +4 h +139 h +4 h +1677 m +2418 m +1 h +11 h +69 h +1 h +1250 h +4 h +4 h +69 h +10 h +4 h +1 h +1 h +9655 m +2878 m +4 h +9656 m +9657 m +146 h +4 h +1 h +1 h +125 h +9658 m +9659 m +4 h +4 h +250 h +4 h +1 h +138 h +9660 m +1 h +104 h +9661 m +4 h +9662 m +9663 m +9664 m +1 h +31 h +1 h +9665 m +1 h +1 h +238 h +10 h +11 h +1 h +10 h +156 h +4 h +677 m +1 h +9666 m +57 h +186 h +4 h +1574 m +83 h +9667 m +1 h +13 h +9668 m +9669 m +9670 m +1 h +11 h +10 h +4 h +1 h +9671 m +10 h +9672 m +10 h +11 h +9673 m +55 h +9674 m +987 m +10 h +4 h +9675 m +3847 m +4 h +11 h +9676 m +4 h +195 h +11 h +10 h +9677 m +371 h +9678 m +3 h +83 h +4 h +9679 m +25 h +10 h +9680 m +170 h +4 h +9681 m +4 h +6869 m +3 h +169 h +6599 m +1 h +9682 m +9683 m +4 h +124 h +569 h +4 h +9684 m +1 h +5917 h +79 h +4 h +3820 m +55 h +4 h +170 h +4 h +4 h +74 h +1024 m +9685 m +9686 m +9687 m +9688 m +4 h +1 h +10 h +1 h +11 h +9689 m +447 h +10 h +443 h +9690 m +10 h +9691 m +4 h +9692 m +7395 m +125 h +4 h +4 h +113 h +9693 m +4 h +9694 m +167 h +57 h +9695 m +11 h +9696 m +9697 m +939 m +10 h +9698 m +65 h +10 h +146 h +4 h +4 h +4 h +3 h +9699 m +4 h +13 h +36 h +9700 m +195 h +92 h +74 h +9701 m +124 h +41 h +4 h +1 h +25 h +9702 m +1 h +55 h +1 h +10 h +109 h +4 h +10 h +447 h +82 h +1 h +1 h +433 m +1116 m +264 m +4 h +266 h +99 m +4 h +59 h +4 h +203 m +10 h +146 h +3 h +4 h +9703 m +4 h +9704 m +297 h +1 h +27 h +10 h +10 h +150 m +403 h +6102 m +77 h +4 h +4000 m +2186 m +1 h +135 h +9705 m +1 h +83 h +4 h +1 h +4 h +1 h +72 m +9706 m +4 h +4 h +10 h +3841 m +4 h +56 h +4 h +1201 m +82 h +447 h +6963 m +4 h +1 h +4 h +4 h +172 h +11 h +1 h +124 h +211 m +10 h +4 h +7572 m +1 h +2374 m +1 h +57 h +4 h +11 h +4 h +4 h +10 h +4 h +143 h +1 h +93 h +1 h +4 h +77 h +1 h +4 h +9707 m +4 h 
+4 h +9708 m +41 h +4 h +1 h +9709 m +4 h +10 h +9710 m +258 h +4 h +9711 m +9712 m +4 h +4 h +94 h +11 h +4 h +10 h +258 h +4 h +4 h +4 h +59 h +9713 m +9714 m +1 h +10 h +1 h +9715 m +4 h +10 h +9716 m +82 h +1 h +9717 m +9718 m +10 h +4 h +538 h +9719 m +1 h +1 h +307 h +114 h +4 h +56 h +12 h +4 h +4 h +70 m +10 h +10 h +9720 m +9721 m +4 h +319 h +4 h +10 h +4 h +1 h +1 h +9722 m +4 h +10 h +2002 m +1 h +10 h +4 h +9723 m +59 h +1 h +9724 m +146 h +64 h +4 h +1 h +10 h +181 h +9725 m +110 h +9726 m +279 h +79 h +83 h +9727 m +10 h +10 h +10 h +1 h +110 h +383 h +9728 m +7394 m +1137 h +9729 m +10 h +9730 m +9731 m +4 h +279 h +295 h +1 h +10 h +11 h +3 h +10 h +10 h +10 h +9732 m +10 h +1 h +73 h +4 h +104 h +2475 m +169 h +10 h +11 h +10 h +9733 m +4 h +9734 m +963 m +4 h +9735 m +358 h +9736 m +9737 m +10 h +1 h +1 h +4 h +4 h +1 h +139 h +83 h +10 h +1 h +1253 m +9738 m +11 h +10 h +82 h +10 h +4 h +190 h +156 h +74 h +4 h +11 h +4 h +533 h +1 h +9739 m +1 h +4 h +4 h +9740 m +3 h +9741 m +146 h +97 h +57 h +74 h +1 h +25 h +31 h +9742 m +10 h +10 h +169 h +4 h +11 h +196 h +97 h +10 h +3494 m +9743 m +4 h +4 h +9744 m +92 h +4 h +4 h +10 h +181 h +11 h +73 h +256 m +9745 m +829 m +10 h +82 h +11 h +4 h +9746 m +112 h +1 h +4 h +82 h +1 h +1 h +9747 m +1 h +9748 m +250 h +4 h +4 h +10 h +10 h +41 h +9749 m +10 h +104 h +9750 m +170 h +195 h +9751 m +3 h +1 h +59 h +4 h +4 h +9752 m +9753 m +4 h +1 h +359 h +9754 m +4 h +169 h +1 h +10 h +4 h +9755 m +4 h +1 h +4 h +114 h +332 h +10 h +4 h +9756 m +1 h +4 h +11 h +185 h +9757 m +1403 m +4 h +4 h +82 h +4 h +10 h +447 h +10 h +258 h +31 h +4 h +109 h +10 h +41 h +77 h +9758 m +83 h +9759 m +186 h +1083 h +4 h +164 h +4 h +219 m +4 h +9760 m +94 h +1 h +221 m +4 h +1 h +443 h +4 h +4 h +4 h +8 h +170 h +25 h +74 h +4 h +1 h +4 h +10 h +12 h +9761 m +10 h +4 h +1 h +1790 h +10 h +4 h +9762 m +79 h +125 h +4 h +190 h +601 h +4 h +9763 m +9764 m +9765 m +1 h +4 h +9766 m +11 h +11 h +1 h +124 h +1884 h +9767 m +1 
h +276 h +104 h +9768 m +358 h +10 h +10 h +1 h +9769 m +9770 m +4 h +1 h +4 h +533 h +4 h +83 h +2582 m +1 h +64 h +4 h +4 h +82 h +569 h +1 h +170 h +9771 m +82 h +9772 m +1 h +2418 m +1337 m +109 h +1 h +1 h +135 h +135 h +9773 m +229 h +1 h +2865 m +4 h +1 h +279 h +79 h +1 h +10 h +10 h +4 h +10 h +10 h +4 h +3561 m +4 h +6387 m +1470 h +69 h +4 h +304 m +4 h +687 h +9774 m +4 h +9775 m +9776 m +4 h +4 h +1 h +1 h +4 h +278 h +1 h +31 h +10 h +1 h +7641 m +9777 m +9778 m +1 h +9779 m +123 h +4 h +82 h +144 h +238 h +124 h +4 h +4 h +1 h +4 h +353 m +258 h +4 h +2184 m +265 h +1 h +489 m +1 h +124 h +10 h +10 h +83 h +9780 m +9781 m +11 h +10 h +10 h +1975 m +4 h +10 h +10 h +1 h +1 h +10 h +11 h +4 h +9782 m +109 h +9783 m +4 h +10 h +4 h +9784 m +59 h +4 h +4 h +1 h +9785 m +31 h +9786 m +110 h +9787 m +1 h +4 h +109 h +10 h +10 h +250 h +4 h +4702 m +4 h +5378 m +536 h +59 h +10 h +4 h +11 h +9788 m +1 h +4556 m +4 h +9789 m +9790 m +1 h +332 h +9791 m +4 h +4 h +4 h +9792 m +82 h +1 h +9793 m +1137 h +3 h +9794 m +10 h +9795 m +31 h +10 h +10 h +4 h +108 h +1 h +4 h +195 h +9796 m +1 h +1 h +4 h +143 h +1 h +520 h +9797 m +4 h +9798 m +4 h +986 h +1 h +9799 m +74 h +36 h +4 h +4 h +279 h +3704 m +4 h +4 h +10 h +3 h +9800 m +4 h +4 h +9801 m +4 h +10 h +31 h +9802 m +9803 m +4 h +238 h +10 h +9804 m +4 h +4 h +10 h +1 h +718 h +2101 m +1 h +1 h +2379 h +170 h +4 h +10 h +170 h +9805 m +10 h +147 h +172 h +1 h +9806 m +1 h +83 h +447 h +6197 m +9807 m +10 h +9808 m +4 h +591 m +123 h +4 h +1 h +265 h +9809 m +125 h +124 h +4 h +9810 m +10 h +10 h +119 h +4 h +82 h +1 h +339 m +368 h +403 h +9811 m +1955 m +8626 m +9812 m +4 h +250 h +69 h +4 h +109 h +1 h +82 h +1 h +4 h +289 h +192 h +10 h +278 h +9813 m +195 h +1 h +4 h +10 h +112 h +299 h +1 h +1406 h +10 h +1 h +4 h +73 h +1 h +12 h +9814 m +4 h +104 h +4 h +56 h +4 h +1 h +4 h +4 h +57 h +583 h +9815 m +4 h +65 h +82 h +170 h +4 h +9816 m +10 h +10 h +1 h +109 h +9817 m +22 h +447 h +10 h +4 h +4 h +4 h 
+10 h +9818 m +10 h +4 h +124 h +1 h +10 h +9819 m +65 h +57 h +1 h +4 h +4 h +1 h +11 h +4553 m +83 h +1 h +3369 m +278 h +196 h +10 h +82 h +10 h +109 h +4932 m +4 h +9820 m +10 h +1 h +1 h +10 h +195 h +10 h +4 h +1 h +4 h +1 h +4 h +9821 m +258 h +1030 h +4 h +4 h +9822 m +10 h +10 h +238 h +9823 m +9824 m +9825 m +1 h +270 h +1 h +10 h +629 m +10 h +109 h +1 h +195 h +10 h +9826 m +265 h +1 h +82 h +9827 m +9828 m +4320 m +60 m +1 h +114 h +4 h +1 h +109 h +10 h +9829 m +569 h +109 h +83 h +11 h +124 h +1 h +65 h +10 h +1 h +1201 h +238 h +22 h +10 h +6941 m +100 m +10 h +1 h +82 h +238 h +338 m +2148 m +1 h +9830 m +147 h +1 h +10 h +9831 m +195 h +31 h +92 h +9832 m +9833 m +10 h +4 h +4 h +196 h +278 h +9834 m +270 h +4 h +124 h +1117 m +278 h +9835 m +1 h +10 h +9836 m +108 h +1 h +1 h +4 h +1 h +11 h +9837 m +79 h +1 h +3 h +4 h +4 h +9838 m +1 h +10 h +4 h +1 h +1 h +1 h +4 h +10 h +4 h +4 h +1 h +57 h +4 h +4 h +1 h +4 h +10 h +9839 m +9840 m +94 h +4 h +83 h +59 h +4 h +114 h +10 h +4 h +9841 m +4 h +3 h +443 h +57 h +4 h +9842 m +1 h +1 h +3303 m +9843 m +1 h +196 h +104 h +9844 m +4 h +4 h +1 h +4 h +10 h +109 h +1 h +258 h +10 h +4 h +9845 m +4 h +10 h +83 h +1 h +10 h +10 h +9846 m +104 h +10 h +1 h +10 h +41 h +11 h +4 h +36 h +4 h +73 h +10 h +109 h +1 h +4 h +4 h +4 h +4 h +10 h +10 h +238 h +9847 m +10 h +10 h +1 h +4 h +57 h +9848 m +1 h +9849 m +10 h +9850 m +9851 m +4 h +10 h +488 h +92 h +10 h +196 h +41 h +9852 m +1 h +9853 m +4 h +1 h +4 h +1 h +399 h +4 h +25 h +10 h +4 h +4 h +4 h +1 h +8 h +1442 m +11 h +939 m +36 h +83 h +273 m +4 h +1 h +1 h +1 h +9854 m +4 h +1 h +1 h +3321 m +48 h +79 h +185 h +125 h +10 h +79 h +757 h +9855 m +173 h +9856 m +4 h +258 h +4 h +359 h +9857 m +4 h +10 h +4 h +10 h +1 h +1 h +4 h +9858 m +9859 m +1 h +82 h +9860 m +143 h +1016 h +9861 m +1 h +9862 m +10 h +9863 m +1835 h +31 h +4 h +9864 m +1 h +10 h +4 h +124 h +4 h +443 h +10 h +9865 m +10 h +10 h +4 h +156 h +238 h +40 h +9866 m +9867 m +4 h +3398 m 
+692 h +4 h +1 h +10 h +11 h +4 h +9868 m +9869 m +9870 m +1 h +31 h +4 h +9871 m +11 h +4 h +4 h +4 h +9872 m +7306 m +1 h +1 h +9873 m +10 h +55 h +185 h +10 h +1 h +10 h +1 h +1 h +4 h +10 h +10 h +196 h +10 h +73 h +4 h +6107 m +12 h +4 h +9874 m +59 h +8571 m +1 h +46 h +1 h +1 h +77 h +4 h +4 h +9875 m +9876 m +83 h +10 h +9877 m +1 h +1 h +10 h +10 h +9878 m +11 h +10 h +297 h +41 h +829 m +1 h +10 h +1 h +10 h +4 h +10 h +4 h +4 h +83 h +9879 m +8472 m +929 m +143 h +538 h +9880 m +1 h +97 h +12 h +4 h +1 h +3 h +278 h +156 h +13 h +10 h +4 h +1 h +4 h +4 h +238 h +64 h +125 h +1070 m +57 h +1 h +4 h +9881 m +4 h +10 h +11 h +10 h +11 h +108 h +1 h +1 h +9882 m +3111 m +3 h +82 h +9883 m +1068 m +1 h +4 h +9884 m +9885 m +869 h +332 h +4 h +10 h +4 h +9886 m +57 h +10 h +4 h +1 h +9887 m +4 h +69 h +1 h +4 h +9888 m +9889 m +2720 m +10 h +1 h +10 h +9890 m +4 h +307 h +4 h +4 h +1 h +83 h +10 h +10 h +4 h +10 h +3847 m +10 h +110 h +9891 m +73 h +4 h +104 h +1 h +274 h +4 h +9892 m +184 h +10 h +1 h +9893 m +4 h +1 h +241 m +1 h +4 h +4 h +109 h +11 h +9894 m +1 h +9895 m +9896 m +9897 m +9898 m +1 h +9899 m +9900 m +9901 m +9902 m +109 h +9903 m +1 h +4 h +10 h +9904 m +10 h +1 h +1083 h +83 h +4 h +124 h +9905 m +36 h +185 h +578 h +12 h +4 h +10 h +4 h +109 h +1 h +4 h +4 h +1 h +9906 m +1418 m +4 h +2928 m +2313 m +139 h +9907 m +10 h +9908 m +9909 m +1 h +11 h +1 h +94 h +195 h +1 h +9910 m +4 h +45 h +4 h +57 h +185 h +4 h +9911 m +119 h +4 h +64 h +1 h +27 h +9912 m +9913 m +4 h +169 h +4 h +10 h +10 h +9914 m +1 h +4 h +4 h +59 h +358 h +4 h +114 h +10 h +97 h +3 h +4 h +9915 m +1 h +4 h +4 h +4 h +4 h +4 h +10 h +8 h +9916 m +9917 m +9918 m +278 h +83 h +64 h +196 h +1 h +9919 m +4 h +9920 m +4 h +9921 m +386 h +4 h +10 h +4 h +7924 m +1 h +4 h +9922 m +4 h +9923 m +4 h +9924 m +1 h +185 h +4 h +1 h +4 h +10 h +77 h +10 h +9925 m +1 h +9926 m +4 h +1 h +11 h +4 h +4 h +79 h +4 h +631 m +4 h +4 h +9927 m +459 m +25 h +687 h +9928 m +82 h +10 h +57 h 
+10 h +31 h +1 h +10 h +4 h +4 h +9929 m +1 h +1 h +1835 h +9930 m +4 h +464 h +1 h +9931 m +10 h +112 h +9932 m +185 h +69 h +11 h +4 h +109 h +2041 m +9933 m +9934 m +108 h +9935 m +156 h +9936 m +1 h +4 h +9937 m +10 h +8193 m +195 h +9938 m +4 h +196 h +9939 m +10 h +1 h +4 h +83 h +9940 m +10 h +1 h +9941 m +767 m +10 h +59 h +1 h +143 h +307 h +1 h +9942 m +2751 m +125 h +9943 m +9944 m +10 h +143 h +4867 m +266 h +4 h +3293 m +9945 m +9946 m +1 h +3534 m +5483 m +4 h +1 h +10 h +2710 m +1 h +1 h +9947 m +10 h +10 h +10 h +11 h +4 h +1 h +4 h +9948 m +9949 m +1 h +9950 m +9951 m +1 h +10 h +1 h +73 h +9952 m +11 h +4 h +4 h +1 h +9953 m +10 h +41 h +146 h +4 h +6869 m +1 h +10 h +1 h +31 h +4 h +1 h +1 h +4 h +1 h +4 h +4 h +4 h +57 h +1 h +935 h +9954 m +1847 m +4 h +9955 m +1 h +9956 m +9957 m +11 h +109 h +4 h +9958 m +45 h +9959 m +1 h +10 h +82 h +143 h +1 h +371 h +10 h +4 h +4 h +4 h +4 h +146 h +1 h +56 h +4 h +1 h +10 h +1 h +258 h +10 h +10 h +1 h +65 h +1 h +9960 m +4 h +1 h +1 h +164 h +9961 m +4 h +1 h +1 h +9962 m +10 h +4 h +4 h +4 h +9963 m +1 h +443 h +31 h +9964 m +11 h +143 h +82 h +9965 m +4 h +25 h +4 h +1 h +9966 m +82 h +8 h +238 h +4 h +4 h +9967 m +9968 m +966 m +279 h +10 h +4 h +4 h +9969 m +471 m +9970 m +9971 m +10 h +4 h +12 h +9972 m +11 h +258 h +172 h +400 m +4 h +3155 m +9973 m +1 h +10 h +9974 m +9975 m +11 h +4 h +3025 h +124 h +10 h +9976 m +13 h +146 h +4 h +1 h +9977 m +57 h +9978 m +4 h +4 h +9 h +11 h +258 h +9979 m +74 h +359 h +55 h +371 h +41 h +1 h +307 h +258 h +9980 m +41 h +27 h +1220 m +10 h +1260 m +92 h +10 h +9981 m +10 h +1 h +113 h +9982 m +10 h +3 h +8070 m +4 h +9983 m +9984 m +10 h +195 h +10 h +9985 m +9986 m +9987 m +9988 m +371 h +4 h +9989 m +4 h +4 h +1 h +83 h +4 h +4 h +4 h +9990 m +31 h +1 h +1 h +9991 m +1 h +9992 m +9993 m +9994 m +4 h +4 h +1 h +59 h +65 h +1 h +4 h +4 h +1 h +10 h +1 h +4 h +10 h +10 h +74 h +124 h +1 h +4 h +4 h +9995 m +9996 m +9997 m +4 h +9998 m +1 h +986 h +59 h +3 h +4 
h +4 h +9999 m +4 h +48 h +3 h +10 h +167 h +1 h +1 h +4 h +4 h +3 h +9956 m +1 h +4 h +10000 m +1374 m +1504 m +4 h +4 h +10001 m +4 h +4 h +1619 h +169 h +10 h +1 h +6022 m +4 h +1 h +1 h +41 h +278 h +265 h +196 h +489 m +57 h +536 h +74 h +8809 m +4 h +10002 m +307 h +4 h +1886 m +4 h +10003 m +3 h +124 h +167 h +10 h +1 h +4 h +278 h +4 h +143 h +10004 m +10005 m +10 h +1 h +1 h +1 h +1 h +10 h +423 m +11 h +40 h +4 h +4 h +10006 m +4 h +4 h +10007 m +4 h +55 h +4 h +10008 m +82 h +10 h +10 h +1 h +4 h +1250 h +278 h +10009 m +10 h +83 h +1 h +3 h +109 h +10010 m +1 h +185 h +270 h +1122 m +1595 m +1 h +4 h +1 h +1 h +1074 h +4 h +1 h +10 h +1 h +125 h +4 h +1 h +4 h +4 h +3303 m +1 h +4 h +4376 m +4 h +4 h +872 m +4 h +4 h +123 h +4 h +1 h +10011 m +10 h +11 h +57 h +1 h +31 h +10012 m +1 h +83 h +4 h +10013 m +10 h +10 h +1 h +4 h +147 h +112 h +109 h +1 h +138 h +1 h +82 h +1074 h +10014 m +143 h +4 h +4 h +74 h +4 h +10015 m +4 h +4 h +4 h +185 h +1 h +10016 m +4 h +4 h +258 h +2788 m +4 h +4 h +297 h +10 h +1 h +104 h +41 h +1309 m +195 h +1 h +10017 m +138 h +1 h +4 h +31 h +10018 m +124 h +5036 m +4 h +10019 m +10 h +10020 m +1 h +1470 h +10 h +1 h +10021 m +1 h +10 h +10 h +10022 m +10023 m +4 h +4 h +7541 m +3 h +1 h +10 h +73 h +1 h +784 m +763 m +196 h +45 h +4 h +125 h +11 h +1 h +1 h +10024 m +10025 m +4 h +4 h +10026 m +285 m +1 h +1 h +1137 h +10027 m +4 h +82 h +10 h +4 h +1 h +10028 m +1309 m +371 h +4 h +1 h +10 h +4 h +135 h +10029 m +10030 m +83 h +258 h +620 m +1 h +123 h +447 h +10 h +4 h +10031 m +10032 m +4 h +1 h +10 h +10 h +4 h +10033 m +1 h +4 h +4 h +692 h +238 h +10 h +10034 m +79 h +1 h +4 h +10035 m +351 m +10 h +986 h +10036 m +10037 m +59 h +10038 m +1 h +10 h +4 h +4 h +186 h +146 h +4 h +10 h +10 h +408 m +25 h +185 h +1261 h +10 h +109 h +10039 m +1 h +4 h +3 h +10040 m +4 h +10 h +1 h +784 m +1 h +59 h +82 h +1835 h +10 h +10041 m +1 h +1 h +10 h +157 h +10 h +10 h +8 h +1 h +45 h +124 h +10 h +10 h +10042 m +10043 m +1 h 
+10 h +986 h +10 h +4 h +1 h +1892 m +10044 m +1 h +10 h +10045 m +4 h +10 h +1893 m +10046 m +10047 m +10 h +1 h +10048 m +900 m +1685 h +1 h +10049 m +4 h +1 h +92 h +4 h +10 h +11 h +4 h +4 h +10050 m +642 m +1 h +1 h +79 h +1638 m +57 h +173 h +125 h +57 h +1 h +1 h +10051 m +1 h +4 h +10052 m +4 h +4 h +10 h +295 h +10053 m +10054 m +196 h +4 h +383 h +82 h +1 h +1 h +4 h +10055 m +4 h +10056 m +1 h +1 h +10 h +10057 m +4 h +41 h +10058 m +1 h +10059 m +82 h +11 h +278 h +4 h +10060 m +1 h +10061 m +10 h +4 h +10062 m +1 h +1785 m +4 h +10063 m +11 h +10064 m +4 h +4 h +109 h +113 h +4 h +4 h +10065 m +4 h +4 h +1838 m +1 h +10066 m +82 h +1 h +10 h +73 h +4 h +3 h +1 h +4 h +1185 m +4 h +4 h +10 h +10 h +10 h +4 h +10 h +172 h +4 h +41 h +4 h +1 h +4 h +10067 m +59 h +4 h +4 h +1 h +2379 h +1 h +1 h +10068 m +114 h +1 h +10069 m +10 h +4 h +10070 m +10071 m +4 h +1 h +10072 m +4 h +4 h +4 h +10 h +146 h +1250 h +4 h +10073 m +31 h +10074 m +1 h +1642 h +36 h +59 h +1 h +6766 m +10 h +6124 m +10 h +4 h +10 h +1 h +1 h +156 h +25 h +4 h +1 h +262 h +4 h +4 h +10075 m +4 h +1261 h +371 h +10076 m +10 h +10 h +4 h +4 h +4 h +10 h +1 h +4 h +4 h +1 h +56 h +11 h +10 h +1 h +10 h +4 h +1 h +10 h +124 h +1 h +10077 m +10 h +1 h +4 h +1 h +1 h +489 m +10078 m +3 h +4 h +10 h +4 h +1 h +82 h +4 h +10 h +10079 m +1632 m +2379 h +2733 h +10080 m +10 h +82 h +1 h +4 h +147 h +36 h +10 h +10081 m +4 h +10082 m +10 h +1260 m +11 h +1027 m +1 h +4 h +4 h +4 h +388 m +1 h +4 h +114 h +82 h +214 m +4 h +10083 m +737 m +1 h +10084 m +10 h +230 h +10085 m +12 h +11 h +10086 m +1 h +1 h +4 h +1 h +1 h +10087 m +2840 m +556 h +4 h +10088 m +1 h +4 h +4 h +82 h +1 h +10 h +4 h +4 h +4 h +10089 m +146 h +123 h +10090 m +10 h +4 h +10091 m +1 h +10092 m +1 h +1 h +4 h +10093 m +268 m +4 h +10094 m +4240 m +59 h +10095 m +4 h +10 h +10096 m +10097 m +11 h +174 h +10 h +229 h +976 h +10098 m +13 h +10 h +4 h +10099 m +1137 h +73 h +25 h +274 h +4 h +1 h +601 h +1 h +1 h +4 h +4 h +770 
m +1 h +125 h +169 h +1 h +10 h +10 h +1 h +45 h +124 h +10100 m +57 h +4 h +4 h +1 h +1 h +10101 m +74 h +10 h +687 h +1 h +91 h +4 h +10102 m +1 h +83 h +10 h +10 h +4 h +1 h +4 h +1 h +10103 m +4 h +2617 m +10104 m +1 h +4 h +92 h +31 h +7870 m +4 h +10 h +4 h +10105 m +10 h +11 h +4 h +4 h +79 h +2303 m +230 h +11 h +10106 m +412 m +10 h +4 h +10 h +10107 m +1 h +10108 m +10 h +11 h +10109 m +3990 m +1 h +185 h +11 h +57 h +109 h +640 h +74 h +238 h +10 h +64 h +4 h +11 h +390 m +124 h +10110 m +4 h +219 m +135 h +4 h +10 h +1 h +65 h +11 h +4 h +1772 h +4 h +10 h +4 h +4 h +1 h +10 h +4 h +4 h +4 h +41 h +4 h +10 h +1 h +10 h +1 h +1 h +10 h +1 h +229 h +4 h +57 h +97 h +338 m +10 h +147 h +10111 m +1 h +4 h +4 h +10112 m +10 h +10113 m +79 h +10 h +3837 m +463 h +4 h +1 h +10114 m +4 h +10115 m +10116 m +4 h +4 h +59 h +1 h +1370 m +11 h +92 h +55 h +4 h +1 h +4 h +4 h +1 h +1 h +1714 h +1 h +4 h +10 h +10 h +10117 m +11 h +1 h +11 h +10 h +11 h +25 h +4 h +10 h +955 m +10118 m +8 h +1 h +692 h +591 m +10119 m +10120 m +4 h +1 h +4 h +5976 m +1 h +8243 m +10121 m +65 h +4 h +4 h +10122 m +4 h +1650 h +10123 m +4 h +10 h +13 h +64 h +125 h +3 h +10124 m +10 h +124 h +1 h +1 h +1 h +10125 m +1 h +10126 m +4 h +4 h +10127 m +65 h +10 h +4 h +135 h +57 h +10 h +31 h +4 h +22 h +124 h +1 h +10 h +10128 m +4 h +10129 m +1 h +79 h +10 h +4 h +10130 m +10131 m +124 h +124 h +11 h +10132 m +4 h +109 h +260 m +1 h +10 h +4 h +1 h +4 h +10133 m +1 h +55 h +295 h +10134 m +4 h +110 h +4 h +11 h +10 h +10135 m +297 h +1 h +10 h +169 h +629 m +10 h +4 h +82 h +1 h +11 h +520 h +1 h +109 h +10 h +10 h +1 h +4 h +4 h +1766 h +1 h +10136 m +4 h +4 h +1 h +4 h +4 h +109 h +933 m +4 h +83 h +10137 m +10 h +10 h +10 h +1 h +181 h +270 h +4 h +4 h +169 h +4 h +97 h +578 h +10138 m +4 h +1 h +4 h +109 h +1766 h +10139 m +55 h +4 h +2851 m +4 h +33 m +10 h +4 h +5863 m +10140 m +4 h +4 h +4 h +10141 m +4 h +172 h +4 h +25 h +4 h +1 h +2041 m +10 h +4 h +10142 m +4 h +1 h +10143 m 
+1 h +10 h +1 h +10 h +4 h +4 h +10 h +124 h +4 h +2374 m +4 h +10 h +1199 m +358 h +11 h +4 h +146 h +4 h +10144 m +74 h +57 h +4 h +10 h +55 h +125 h +4 h +10145 m +10 h +4 h +1 h +82 h +10 h +4 h +10 h +266 h +195 h +10 h +1 h +4 h +196 h +10146 m +10147 m +114 h +1 h +359 h +10 h +11 h +3177 m +4 h +10 h +4 h +1 h +10148 m +4 h +4 h +45 h +1157 m +10 h +10 h +4 h +1083 h +10149 m +169 h +1650 h +123 h +4 h +3 h +82 h +10 h +4 h +229 h +1 h +57 h +4 h +1 h +10 h +119 h +4 h +4 h +10 h +92 h +4 h +1 h +4 h +4 h +10 h +399 h +10150 m +83 h +1 h +307 h +4 h +570 h +124 h +4 h +1 h +1 h +10151 m +1 h +124 h +10152 m +4 h +1 h +10 h +64 h +1 h +73 h +4 h +123 h +10153 m +4 h +4 h +10 h +10154 m +10155 m +70 m +1250 h +10 h +1 h +10156 m +3680 m +1 h +97 h +10157 m +4 h +578 h +1016 h +4 h +13 h +4 h +4 h +1 h +779 h +10158 m +4 h +1 h +1 h +10 h +1 h +185 h +172 h +2475 m +1 h +1 h +10 h +4 h +1105 h +4 h +10159 m +1 h +56 h +1 h +10160 m +10 h +10161 m +41 h +1 h +1861 m +1650 h +10 h +83 h +59 h +4 h +4 h +10162 m +56 h +10 h +4 h +10163 m +10164 m +10165 m +4 h +11 h +8 h +41 h +55 h +4 h +10166 m +332 h +1646 m +10167 m +10168 m +11 h +167 h +10 h +1 h +10169 m +4 h +10 h +10 h +1 h +1070 m +10 h +1 h +4 h +10170 m +192 h +10171 m +10 h +459 m +4 h +10 h +4 h +4 h +1089 m +4 h +10 h +10172 m +4 h +55 h +57 h +1 h +10173 m +124 h +1 h +10 h +10 h +7521 m +388 m +1 h +1 h +170 h +10174 m +1 h +4 h +4 h +25 h +4 h +5567 m +109 h +31 h +11 h +10175 m +4 h +4 h +147 h +10176 m +135 h +4 h +10 h +11 h +10 h +4 h +10177 m +10178 m +1 h +1 h +10179 m +1884 h +10 h +4 h +1 h +12 h +12 h +10180 m +25 h +79 h +31 h +10181 m +10 h +167 h +1 h +4 h +10 h +4 h +4 h +12 h +10 h +229 h +10 h +10182 m +114 h +10 h +94 h +4 h +297 h +3 h +10183 m +10184 m +10185 m +698 m +3622 m +31 h +4 h +4 h +1 h +74 h +10 h +10186 m +4 h +1 h +1 h +104 h +10187 m +1796 h +1 h +23 h +167 h +1 h +10188 m +158 h +4 h +10 h +3 h +10189 m +57 h +28 h +109 h +10 h +61 m +1547 m +590 h +10 h +146 h 
+124 h +10190 m +10191 m +4 h +583 h +10 h +1 h +74 h +158 h +143 h +4 h +1 h +4 h +10 h +45 h +4 h +64 h +4 h +10192 m +313 h +10 h +4 h +4 h +57 h +10193 m +8 h +1 h +10194 m +10 h +10 h +1 h +279 h +4 h +10195 m +4 h +10 h +1 h +4 h +173 h +5863 m +10 h +4 h +1 h +1 h +2475 m +1 h +4 h +1 h +1 h +10196 m +10197 m +9 h +1 h +4 h +2865 m +4 h +4 h +10198 m +1 h +266 h +6851 m +10 h +4 h +1 h +10 h +10 h +258 h +1 h +4 h +10199 m +176 m +10200 m +10 h +185 h +4 h +10 h +1 h +4 h +10201 m +1 h +4 h +10 h +238 h +2760 m +73 h +1 h +2300 m +10202 m +4 h +10203 m +4 h +10204 m +109 h +10205 m +83 h +1 h +1 h +1 h +4 h +10 h +4 h +6586 m +10206 m +144 h +4 h +1 h +1 h +4 h +4 h +4 h +10 h +1 h +11 h +10207 m +4 h +4 h +27 h +4 h +1 h +1 h +11 h +1 h +4 h +4 h +6125 m +4 h +10208 m +13 h +1 h +10 h +4 h +4 h +857 h +1 h +1 h +4 h +1 h +10209 m +358 h +4 h +4 h +4 h +4 h +4 h +1 h +114 h +10210 m +10 h +2041 m +10211 m +10212 m +1 h +10 h +1 h +1 h +10213 m +4 h +4 h +114 h +10 h +74 h +10214 m +10215 m +10216 m +1454 m +4 h +2475 h +4514 m +4 h +11 h +172 h +10217 m +1 h +1 h +1 h +10 h +1 h +10218 m +172 h +10219 m +276 h +1027 m +10 h +10220 m +1 h +10 h +4 h +64 h +10221 m +10222 m +10 h +2788 m +4 h +118 h +10223 m +125 h +10224 m +31 h +11 h +10225 m +2532 m +1 h +4 h +1 h +1 h +1 h +1 h +10 h +1 h +10 h +147 h +10226 m +10227 m +10228 m +256 m +82 h +4 h +4 h +1 h +4 h +4 h +146 h +4 h +258 h +10229 m +10 h +73 h +97 h +10 h +10230 m +25 h +79 h +112 h +4 h +4 h +10231 m +97 h +976 h +10232 m +104 h +97 h +1309 h +13 h +278 h +10233 m +4 h +10234 m +10235 m +10236 m +4 h +4 h +4 h +10237 m +4 h +1 h +4 h +4 h +10238 m +230 h +10239 m +10240 m +4 h +10241 m +4 h +10242 m +146 h +190 h +1016 h +31 h +1 h +11 h +12 h +1 h +10243 m +10 h +10244 m +45 h +92 h +10245 m +82 h +10 h +10 h +447 h +4 h +4 h +7271 m +1 h +4 h +10 h +113 h +83 h +124 h +10 h +238 h +1 h +10246 m +195 h +443 h +10247 m +4 h +196 h +4 h +1766 h +1 h +10 h +73 h +181 h +10248 m +510 m +4 h +1137 
h +25 h +10249 m +10250 m +1471 m +10 h +1 h +12 h +4 h +1 h +4 h +10251 m +10252 m +118 h +10 h +4 h +238 h +10253 m +45 h +1 h +10 h +104 h +10254 m +10255 m +10 h +1 h +4263 m +10256 m +1 h +31 h +1619 h +1725 m +11 h +4 h +1 h +10257 m +41 h +59 h +10 h +10 h +4 h +10258 m +8571 m +4 h +125 h +4 h +266 h +10259 m +1 h +196 h +4 h +57 h +1116 m +10260 m +109 h +10261 m +184 h +10262 m +10263 m +36 h +4 h +11 h +4 h +3 h +10 h +10264 m +57 h +10 h +1 h +10265 m +4 h +1 h +4 h +64 h +10 h +1 h +569 h +1 h +4 h +167 h +108 h +10266 m +1 h +801 m +10267 m +10 h +1 h +1 h +1 h +4 h +31 h +4 h +4356 m +278 h +4 h +10268 m +124 h +4 h +31 h +4 h +4 h +124 h +4 h +10269 m +11 h +119 h +4 h +125 h +10 h +11 h +4 h +1 h +4 h +4 h +10270 m +4 h +57 h +25 h +10 h +10 h +4 h +59 h +10271 m +10 h +10272 m +10273 m +2002 m +2607 m +1685 h +73 h +10 h +10274 m +1047 m +4 h +4 h +4 h +4 h +10275 m +10276 m +10 h +209 m +4 h +1 h +1 h +4 h +83 h +114 h +1 h +72 m +4 h +45 h +10277 m +4 h +10 h +10278 m +5243 m +10 h +1 h +55 h +143 h +4 h +10 h +214 m +10279 m +1 h +238 h +1 h +4 h +10 h +11 h +10 h +1 h +10280 m +10 h +4 h +4 h +59 h +11 h +10281 m +687 h +4 h +10282 m +3 h +56 h +110 h +173 h +56 h +1 h +383 h +82 h +8 h +125 h +10283 m +1 h +11 h +113 h +4 h +4 h +10284 m +10 h +119 h +10 h +1 h +4 h +757 h +2379 h +10285 m +692 h +1 h +10286 m +4 h +3909 m +1 h +4 h +4 h +4 h +1 h +4 h +1 h +10287 m +10 h +4 h +1 h +1 h +10288 m +65 h +10289 m +10290 m +10291 m +1 h +3 h +10292 m +692 h +1620 h +10 h +10293 m +4 h +10 h +10294 m +10295 m +1 h +1 h +4 h +10296 m +31 h +172 h +143 h +123 h +10297 m +10 h +4 h +12 h +4 h +92 h +10298 m +4 h +4608 m +25 h +10299 m +4 h +10300 m +1 h +4 h +1 h +1 h +147 h +36 h +4 h +8 h +4 h +164 h +10301 m +109 h +10 h +186 h +4 h +1 h +10302 m +1 h +10303 m +10304 m +1790 h +1 h +1 h +195 h +4 h +10 h +1 h +10305 m +65 h +4 h +3299 m +1 h +4 h +4 h +114 h +10306 m +4 h +4 h +22 h +4 h +1 h +443 h +59 h +4 h +31 h +1 h +12 h +4 h +4 h +190 h +10 
h +3 h +10307 m +2591 m +10 h +10 h +10308 m +1 h +10309 m +1027 h +14 m +164 h +1 h +97 h +4 h +4 h +61 m +1 h +6438 m +1 h +82 h +57 h +10310 m +4 h +1 h +1 h +4 h +169 h +4 h +601 h +339 m +83 h +1 h +4 h +73 h +65 h +113 h +278 h +4 h +10311 m +4 h +1 h +41 h +4 h +79 h +10312 m +4 h +4 h +4 h +4 h +69 h +4 h +10313 m +10314 m +10315 m +1 h +1 h +556 h +1 h +82 h +4 h +1 h +4 h +31 h +4 h +59 h +4 h +11 h +10 h +1 h +1 h +173 h +61 h +10316 m +124 h +10 h +10317 m +1 h +82 h +4 h +1 h +229 h +304 m +10318 m +10319 m +10 h +25 h +10 h +113 h +10320 m +10 h +139 h +195 h +1 h +4 h +4 h +139 h +125 h +4 h +1790 h +1835 h +4 h +716 m +125 h +3 h +10321 m +4 h +10 h +59 h +5141 m +10322 m +4 h +1 h +4 h +73 h +10323 m +1 h +4 h +1261 h +10 h +1 h +10 h +733 m +10324 m +10 h +10 h +10 h +1 h +1 h +371 h +10325 m +1 h +4 h +10 h +10 h +10 h +4 h +10326 m +4 h +4 h +4 h +4 h +28 h +1 h +4 h +4 h +10 h +3 h +1 h +10 h +4 h +1 h +1 h +31 h +1 h +10327 m +10328 m +4 h +238 h +10329 m +10 h +10330 m +1 h +4 h +1 h +10 h +10 h +10331 m +4 h +59 h +157 h +4 h +1 h +10332 m +10333 m +1 h +4 h +10334 m +45 h +57 h +10 h +4 h +4 h +11 h +900 m +31 h +4 h +1 h +4 h +10335 m +1677 m +399 h +10 h +10 h +1 h +10 h +1374 m +4 h +10336 m +4 h +1 h +41 h +11 h +10 h +10337 m +3 h +4 h +10 h +4 h +1 h +1 h +448 m +10338 m +4 h +10339 m +1 h +13 h +358 h +164 h +146 h +10340 m +1 h +4 h +10341 m +4 h +10342 m +4 h +10 h +64 h +4 h +10343 m +10344 m +125 h +1 h +10 h +4 h +57 h +4 h +4 h +1722 m +4 h +10345 m +9321 m +146 h +1 h +1 h +10346 m +4 h +332 h +109 h +10347 m +10348 m +65 h +10349 m +10350 m +4 h +82 h +10351 m +10 h +55 h +4 h +4 h +4 h +4 h +1 h +10 h +10352 m +10 h +4 h +10 h +10 h +10353 m +10354 m +10 h +10 h +403 h +4 h +4 h +4 h +4905 m +1 h +10355 m +124 h +82 h +45 h +1 h +4 h +10 h +4 h +1 h +1 h +11 h +4 h +4 h +10 h +158 h +27 h +45 h +1790 h +4 h +556 h +31 h +4 h +10 h +10356 m +57 h +368 h +10357 m +10 h +10358 m +10 h +4 h +10359 m +55 h +31 h +1817 m +10360 m 
+11 h +1 h +10 h +10361 m +65 h +109 h +10 h +1 h +1953 m +4 h +125 h +10 h +55 h +195 h +10362 m +10363 m +447 h +4 h +1 h +1 h +94 h +1 h +10 h +1894 m +109 h +1 h +4 h +986 h +1 h +4 h +509 m +4 h +4 h +4 h +10364 m +1 h +279 h +4 h +10365 m +10366 m +10 h +41 h +10367 m +74 h +4 h +10368 m +124 h +114 h +10369 m +3 h +83 h +10370 m +109 h +4 h +10 h +4 h +28 h +170 h +272 m +1 h +4 h +10 h +9860 m +1 h +371 h +10 h +1619 h +1 h +1 h +1 h +109 h +1 h +48 h +10371 m +4 h +581 m +10372 m +1835 h +10373 m +1 h +258 h +10 h +94 h +44 m +10374 m +65 h +464 h +10375 m +104 h +10376 m +1 h +8 h +4 h +170 h +10377 m +10 h +10 h +114 h +4 h +4 h +307 h +1 h +1 h +10 h +10378 m +4 h +1 h +82 h +4 h +65 h +10379 m +10 h +1 h +299 h +4 h +10380 m +27 h +1 h +368 h +307 h +4 h +1 h +4 h +129 h +538 h +1478 h +295 h +10 h +4 h +1 h +276 h +104 h +10381 m +4 h +4 h +12 h +10382 m +10 h +4 h +10 h +2532 m +4 h +4 h +1472 m +10 h +109 h +10383 m +1 h +1 h +10384 m +10 h +2496 m +10 h +258 h +4 h +1 h +279 h +1 h +3435 m +1 h +10385 m +1 h +10386 m +1 h +1 h +97 h +12 h +4 h +109 h +1016 h +4 h +10 h +1 h +170 h +138 h +11 h +10387 m +1 h +4 h +10388 m +11 h +1666 m +10 h +10389 m +65 h +10 h +322 m +4 h +1 h +82 h +4 h +82 h +10 h +1 h +10 h +4 h +10 h +601 h +4 h +10390 m +1 h +468 h +10391 m +1293 m +10392 m +10393 m +4 h +10394 m +10395 m +10396 m +10 h +266 h +4 h +10397 m +83 h +1 h +10398 m +11 h +10 h +56 h +297 h +4 h +4 h +10 h +4 h +1 h +10399 m +1 h +144 h +124 h +1 h +109 h +1563 m +10 h +10400 m +4 h +4 h +10401 m +10 h +10402 m +1 h +4 h +82 h +10403 m +4 h +1 h +10404 m +1769 m +520 h +10405 m +4 h +1308 m +10 h +82 h +11 h +1201 h +10406 m +10407 m +4 h +10408 m +10 h +4 h +1948 m +157 h +1796 h +10 h +125 h +4 h +11 h +295 h +4 h +10 h +5 h +45 h +10 h +10 h +192 h +10409 m +172 h +10410 m +2733 h +104 h +10411 m +31 h +4 h +10 h +97 h +1 h +10 h +64 h +1 h +10412 m +104 h +4 h +5982 m +4 h +10 h +4 h +347 m +4 h +109 h +4 h +1 h +4 h +10413 m +1 h +250 h +10414 
m +4 h +4 h +119 h +4 h +1 h +10415 m +10 h +4 h +57 h +211 h +65 h +1 h +27 h +57 h +1284 m +10416 m +4 h +75 h +10417 m +10418 m +10419 m +10420 m +10421 m +1089 m +10 h +4 h +97 h +157 h +10422 m +10 h +13 h +11 h +56 h +147 h +109 h +10 h +3274 m +4 h +10423 m +10 h +1952 m +1260 h +4 h +10424 m +4 h +13 h +125 h +1 h +1 h +10 h +7938 m +10425 m +10426 m +113 h +10 h +10427 m +10 h +569 h +10 h +108 h +4 h +59 h +10 h +1 h +10428 m +10429 m +10430 m +10431 m +4 h +4 h +10432 m +4 h +1685 h +10433 m +4 h +1 h +2719 m +10434 m +1 h +125 h +4 h +10435 m +10 h +10436 m +1 h +41 h +1 h +358 h +10437 m +3 h +1 h +10438 m +10439 m +4 h +1 h +10440 m +45 h +4 h +1 h +10441 m +4 h +10 h +266 h +4 h +10442 m +94 h +10443 m +41 h +10 h +1 h +9411 m +1 h +224 h +6185 m +1576 m +4 h +10444 m +10445 m +4 h +224 h +10 h +10446 m +4 h +83 h +4 h +4 h +10447 m +1 h +4 h +109 h +59 h +1 h +1 h +4 h +10448 m +10 h +994 m +229 h +146 h +1 h +4 h +976 h +478 m +4 h +57 h +10449 m +10450 m +10 h +10451 m +10452 m +109 h +124 h +4 h +4 h +3622 m +914 m +4 h +4 h +1 h +488 h +2285 m +56 h +4 h +59 h +1 h +1 h +10453 m +4 h +125 h +1 h +10454 m +4 h +4 h +4 h +109 h +10 h +11 h +41 h +464 h +10 h +4 h +4 h +74 h +1027 h +4 h +10 h +109 h +147 h +4 h +185 h +10 h +1403 h +276 h +1 h +4 h +1 h +266 h +8 h +10455 m +31 h +368 h +8 h +10456 m +1 h +557 m +10457 m +1 h +1 h +195 h +10458 m +4 h +181 h +10459 m +4 h +1 h +59 h +4 h +10 h +10460 m +12 h +146 h +10461 m +10462 m +10 h +10463 m +109 h +4 h +10464 m +10 h +4 h +59 h +27 h +4 h +10465 m +4 h +10466 m +23 h +4 h +4 h +3 h +1 h +10 h +203 m +1 h +4 h +4 h +1 h +4 h +74 h +4215 m +10 h +31 h +138 h +10 h +6022 m +10467 m +10468 m +447 h +92 h +195 h +12 h +4 h +1 h +11 h +649 m +10469 m +4 h +2308 m +4 h +10470 m +11 h +45 h +307 h +10471 m +10472 m +4 h +10 h +10473 m +10 h +1685 h +31 h +124 h +4 h +578 h +4 h +2733 h +25 h +10 h +11 h +4 h +65 h +10 h +140 h +1 h +10474 m +4 h +1 h +4 h +55 h +10475 m +4 h +56 h +10476 m +10477 m 
+371 h +4 h +1 h +4 h +10 h +1 h +4 h +10 h +1 h +27 h +33 m +10478 m +10479 m +11 h +10 h +1 h +10 h +10480 m +3680 m +4 h +124 h +10481 m +10482 m +11 h +10 h +1 h +10483 m +10484 m +10485 m +10 h +1 h +4 h +1 h +1016 h +443 h +258 h +1 h +4 h +1 h +10486 m +4 h +10487 m +4 h +1 h +4 h +10488 m +1 h +164 h +1 h +1 h +4 h +10 h +10 h +22 h +45 h +4 h +10 h +1 h +10489 m +8 h +23 h +25 h +4 h +83 h +10490 m +4 h +10491 m +4 h +10492 m +186 h +11 h +10 h +10 h +1 h +94 h +10 h +195 h +4 h +2885 m +4 h +59 h +620 m +10493 m +2928 m +10 h +10494 m +4 h +1 h +1 h +10495 m +10 h +97 h +1 h +319 h +10496 m +59 h +4 h +800 m +229 h +4 h +124 h +307 h +1 h +10497 m +10498 m +104 h +10499 m +11 h +10500 m +4857 m +10501 m +4 h +4 h +1 h +74 h +1 h +4 h +330 h +1884 h +1 h +4 h +173 h +4 h +4 h +10502 m +4 h +1 h +10503 m +10504 m +1 h +83 h +1 h +125 h +1 h +10 h +196 h +1 h +135 h +10 h +10505 m +125 h +92 h +4 h +10 h +319 h +31 h +4 h +10506 m +4 h +1 h +4 h +10507 m +4 h +4 h +185 h +1 h +57 h +59 h +195 h +1 h +8497 m +1 h +4 h +190 h +1 h +4 h +10 h +59 h +10508 m +10509 m +4 h +10 h +10 h +4 h +94 h +10 h +10510 m +10511 m +4 h +10512 m +4 h +1 h +5309 m +4 h +4 h +11 h +276 h +10 h +4 h +57 h +190 h +10513 m +196 h +463 h +10 h +4 h +1 h +4 h +10514 m +1 h +1 h +77 h +1 h +295 h +10515 m +10 h +1 h +4 h +124 h +169 h +10516 m +74 h +4 h +1 h +13 h +1 h +4 h +332 h +4522 m +1 h +167 h +11 h +10517 m +12 h +4 h +169 h +10 h +4 h +41 h +4 h +4 h +4 h +56 h +1 h +10518 m +1 h +1056 m +4 h +1 h +109 h +1 h +10 h +1 h +1 h +10519 m +12 h +4 h +10520 m +289 h +10521 m +4 h +1 h +10522 m +10523 m +4 h +1 h +11 h +10524 m +1 h +4 h +124 h +10525 m +195 h +10526 m +10527 m +11 h +10528 m +4 h +74 h +123 h +109 h +1 h +195 h +1 h +10529 m +164 h +1 h +4 h +10530 m +10531 m +10 h +1 h +1 h +124 h +1 h +10532 m +1 h +82 h +82 h +4 h +258 h +11 h +10533 m +93 h +1 h +41 h +10 h +10534 m +4 h +10 h +219 m +1 h +11 h +146 h +10535 m +10 h +1 h +4 h +10 h +1 h +4 h +41 h +114 h +33 
m +10536 m +4 h +125 h +478 m +10 h +1045 m +135 h +601 h +10537 m +10538 m +447 h +1 h +45 h +4 h +36 h +10539 m +10540 m +7832 m +4 h +104 h +10 h +10 h +11 h +4 h +843 m +236 m +447 h +4 h +4 h +1 h +4 h +4 h +1 h +10541 m +10542 m +11 h +73 h +1 h +1 h +124 h +450 m +3 h +4 h +10543 m +10 h +10 h +57 h +4 h +238 h +10544 m +10545 m +185 h +10546 m +1 h +4 h +258 h +4 h +82 h +48 h +94 h +109 h +10547 m +1 h +10548 m +10 h +119 h +204 h +692 h +1 h +57 h +1 h +1 h +1 h +4 h +190 h +10 h +10 h +4 h +10549 m +1 h +10550 m +25 h +4 h +196 h +4 h +1454 m +10551 m +4 h +10 h +338 m +10552 m +4 h +10 h +265 h +10553 m +56 h +4 h +10554 m +4 h +10555 m +1796 h +1 h +4 h +82 h +11 h +106 h +10556 m +7352 m +10 h +172 h +83 h +10557 m +10 h +4 h +338 m +1 h +3558 m +164 h +104 h +195 h +536 h +1 h +10558 m +4 h +4 h +10559 m +4 h +4 h +104 h +3 h +4 h +10560 m +10561 m +12 h +10 h +1 h +11 h +10562 m +1 h +12 h +109 h +1 h +10563 m +11 h +10 h +10 h +3 h +147 h +69 h +316 m +10564 m +1861 m +4 h +10565 m +282 m +4 h +54 m +10566 m +4 h +1766 h +4 h +4 h +10567 m +1406 h +57 h +11 h +11 h +74 h +31 h +258 h +109 h +10568 m +1 h +1548 m +83 h +986 h +4 h +10569 m +125 h +10570 m +73 h +4 h +1 h +1 h +1201 h +10571 m +10 h +1 h +10572 m +3 h +4 h +119 h +10573 m +10574 m +4 h +10575 m +9 h +10 h +538 h +2961 m +10 h +10 h +139 h +4542 m +10576 m +10577 m +1 h +190 h +1 h +173 h +10578 m +9040 m +1650 h +4 h +4 h +79 h +279 h +1835 h +1 h +1 h +4 h +13 h +10579 m +3 h +10580 m +1 h +10 h +1 h +1 h +1 h +4 h +10 h +4 h +10581 m +8 h +135 h +1 h +433 m +57 h +41 h +10582 m +10583 m +10 h +1 h +4 h +10 h +10 h +184 h +1 h +10584 m +10 h +104 h +4 h +109 h +146 h +97 h +4 h +1 h +10585 m +10586 m +10 h +1 h +1362 m +55 h +11 h +1 h +5 h +1685 h +10 h +10587 m +36 h +135 h +10 h +10 h +10 h +4 h +185 h +57 h +4 h +10588 m +1 h +1535 m +1 h +8133 m +1278 m +91 h +4 h +459 m +4 h +25 h +10589 m +4 h +109 h +10 h +57 h +10590 m +1 h +41 h +82 h +4 h +64 h +4 h +146 h +1 h +8 h +1 h 
+1 h +10 h +1 h +10 h +124 h +1 h +2002 m +4 h +82 h +1 h +4 h +8 h +4 h +1 h +4 h +57 h +7915 m +1027 h +4 h +11 h +4 h +12 h +10 h +10 h +56 h +4 h +4 h +10591 m +10 h +195 h +1732 m +1 h +578 h +169 h +626 h +1 h +1 h +4 h +10592 m +3 h +4 h +10 h +2064 m +10593 m +1 h +82 h +10 h +12 h +4 h +1 h +1 h +10 h +10 h +4 h +109 h +489 h +6197 m +1 h +1 h +10594 m +4 h +4 h +4 h +104 h +1137 h +4 h +10 h +55 h +1 h +4 h +10595 m +10 h +10 h +3 h +11 h +119 h +10596 m +4867 m +97 h +82 h +10597 m +112 h +79 h +59 h +4 h +10598 m +25 h +10 h +196 h +10599 m +4 h +11 h +1953 m +2914 m +976 h +4 h +11 h +31 h +56 h +10600 m +11 h +56 h +4 h +10 h +83 h +57 h +10601 m +1 h +10602 m +59 h +10603 m +10604 m +10605 m +2788 m +123 h +10606 m +1 h +10607 m +10 h +1 h +10608 m +10609 m +146 h +10 h +4 h +57 h +10610 m +1 h +4 h +104 h +4 h +12 h +1 h +3240 m +1 h +1975 m +41 h +10 h +45 h +4 h +1 h +4 h +10 h +4 h +10611 m +195 h +10 h +4 h +10612 m +1250 h +124 h +1 h +10 h +12 h +4 h +805 m +4 h +11 h +10613 m +57 h +1 h +4 h +4 h +1 h +4 h +59 h +10 h +10614 m +4 h +10615 m +4 h +10 h +1 h +1 h +10 h +65 h +297 h +74 h +4 h +6399 m +4 h +10616 m +31 h +1 h +10617 m +1 h +4 h +10618 m +4 h +92 h +41 h +82 h +10619 m +1092 m +1 h +4 h +104 h +4 h +10620 m +10 h +36 h +692 h +10621 m +31 h +172 h +4 h +4 h +124 h +172 h +1 h +11 h +1 h +1 h +4 h +10622 m +10623 m +1 h +10 h +4 h +1 h +10624 m +10 h +4 h +164 h +10625 m +1 h +10626 m +57 h +11 h +1 h +4 h +11 h +10627 m +1 h +10628 m +10 h +59 h +1 h +4 h +1 h +10629 m +4 h +10630 m +74 h +4 h +10631 m +4 h +110 h +1137 h +1089 m +4 h +2887 m +1 h +4 h +4 h +10632 m +31 h +430 m +4 h +1 h +4 h +10 h +157 h +4 h +4 h +10633 m +57 h +10 h +1 h +4 h +11 h +4 h +190 h +4349 m +10634 m +4 h +1 h +10635 m +10 h +10 h +124 h +9757 m +10636 m +4 h +4240 m +83 h +33 h +4692 m +1 h +4 h +1261 h +40 h +1 h +295 h +888 m +10637 m +10 h +55 h +10638 m +1 h +1861 m +10639 m +25 h +10640 m +10 h +60 m +114 h +77 h +45 h +10 h +4 h +10641 m +10 
h +4 h +4 h +11 h +250 h +10 h +10642 m +114 h +4 h +92 h +10643 m +4 h +4 h +10644 m +1 h +1 h +73 h +124 h +4 h +1 h +10645 m +184 h +779 h +10 h +10 h +4 h +10 h +146 h +1 h +10646 m +114 h +10647 m +41 h +4 h +10648 m +581 m +10649 m +10 h +10650 m +56 h +113 h +10651 m +3 h +125 h +10652 m +4 h +10653 m +4 h +10654 m +4 h +1 h +4 h +97 h +4 h +10655 m +83 h +10656 m +11 h +10657 m +4 h +307 h +4 h +4 h +10 h +10 h +10 h +10658 m +1 h +11 h +1 h +10659 m +4 h +10 h +10660 m +64 h +83 h +4 h +295 h +4 h +4 h +92 h +477 m +10 h +10 h +4 h +1 h +4 h +1 h +4 h +4 h +10661 m +4 h +135 h +10662 m +27 h +4 h +10663 m +10664 m +10665 m +4 h +10666 m +1 h +4 h +2788 m +31 h +10667 m +4 h +4 h +1 h +4 h +4 h +146 h +12 h +4 h +478 h +4 h +146 h +10668 m +1024 m +10 h +82 h +10 h +109 h +10669 m +10 h +7128 m +1 h +10670 m +10671 m +1 h +196 h +125 h +1 h +1 h +57 h +11 h +1 h +135 h +83 h +4 h +135 h +10672 m +1136 m +1 h +82 h +692 h +1535 m +1 h +93 h +4 h +820 h +1 h +1 h +10 h +4 h +4 h +10673 m +4 h +4 h +1 h +10674 m +13 h +104 h +82 h +27 h +4 h +113 h +4 h +172 h +10 h +5008 m +4 h +4 h +10 h +4 h +10675 m +1 h +1 h +4 h +196 h +196 h +11 h +1 h +4 h +74 h +169 h +1 h +1 h +367 m +4 h +4 h +10676 m +10677 m +10 h +82 h +1 h +11 h +1 h +25 h +4 h +4 h +1 h +10678 m +4 h +10679 m +4 h +1 h +10 h +6252 m +4 h +488 h +4 h +10 h +10 h +11 h +104 h +1 h +371 h +109 h +10680 m +4 h +1 h +4 h +203 m +10 h +10 h +1 h +4 h +109 h +10 h +3013 m +104 h +262 h +10681 m +10 h +575 m +10682 m +4 h +1 h +1 h +1 h +1 h +10 h +55 h +11 h +4 h +1 h +4 h +109 h +10683 m +10 h +1 h +4 h +1 h +186 h +536 h +10684 m +4 h +4 h +779 h +10685 m +1 h +1 h +4 h +10 h +10 h +10 h +10686 m +223 m +110 h +4 h +10687 m +4 h +83 h +229 h +10688 m +4 h +124 h +1 h +10689 m +157 h +307 h +10690 m +10 h +10691 m +1 h +10 h +10692 m +10 h +173 h +1 h +10693 m +10 h +109 h +1 h +55 h +266 h +184 h +1 h +4 h +10694 m +109 h +4 h +1 h +4 h +1 h +124 h +238 h +140 h +10 h +82 h +1 h +10 h +82 h +10695 m 
+4 h +10696 m +1 h +10697 m +10 h +10698 m +4 h +92 h +4 h +4 h +10 h +4 h +1 h +4 h +109 h +1 h +65 h +1403 h +1027 h +10699 m +10 h +383 h +11 h +4 h +4 h +4 h +10700 m +4 h +1 h +1 h +3048 m +4 h +56 h +10 h +4 h +45 h +94 h +10701 m +1 h +1 h +10702 m +129 h +4 h +10703 m +435 m +156 h +164 h +10704 m +911 m +3 h +10705 m +10 h +1 h +146 h +1 h +181 h +109 h +167 h +124 h +10706 m +10 h +4 h +10707 m +10 h +10 h +4 h +4 h +10 h +1 h +10708 m +125 h +125 h +4 h +10 h +10709 m +113 h +10710 m +195 h +1 h +578 h +1 h +4 h +4 h +10711 m +82 h +4 h +65 h +4 h +10712 m +4 h +59 h +1 h +1 h +167 h +4 h +10713 m +74 h +10714 m +4 h +4 h +79 h +4 h +10715 m +4 h +13 h +4 h +10716 m +4 h +802 m +4292 m +83 h +10717 m +11 h +1 h +4 h +4 h +1 h +1 h +10718 m +22 h +1 h +4 h +3909 m +1 h +10 h +3 h +10 h +10719 m +10720 m +1 h +208 m +1 h +1137 h +443 h +12 h +10 h +10 h +10721 m +10722 m +770 m +1 h +1 h +2038 m +1 h +74 h +1 h +124 h +1 h +10 h +1 h +5866 m +195 h +1768 m +4 h +10723 m +10 h +1 h +4 h +10724 m +1 h +10725 m +10726 m +649 m +10727 m +386 h +4 h +1 h +147 h +4 h +1 h +4 h +10728 m +97 h +1 h +10729 m +4 h +1 h +4 h +1 h +82 h +110 h +10 h +82 h +1 h +4 h +2540 m +1 h +4 h +4 h +196 h +125 h +6855 m +10730 m +327 m +124 h +10 h +536 h +1 h +1 h +1 h +10 h +10731 m +869 h +195 h +10 h +4 h +10732 m +10733 m +1 h +10734 m +3 h +10 h +1309 h +125 h +2172 h +1 h +41 h +1 h +65 h +10735 m +4 h +10736 m +10737 m +4 h +10738 m +1 h +57 h +2418 h +4 h +83 h +10739 m +4 h +74 h +97 h +842 m +1 h +27 h +4 h +1 h +10 h +109 h +45 h +4 h +10 h +1 h +11 h +4 h +10740 m +10 h +10 h +536 h +1564 m +4 h +488 h +4 h +10741 m +4 h +1 h +108 h +4 h +2591 m +10742 m +114 h +79 h +11 h +4 h +79 h +10743 m +10744 m +10745 m +4 h +31 h +10685 m +1 h +10746 m +123 h +73 h +10747 m +3303 m +41 h +4 h +4 h +10748 m +10 h +10 h +83 h +8040 m +10 h +10749 m +1 h +938 m +70 m +10 h +10750 m +1 h +10 h +10 h +10751 m +59 h +64 h +10 h +4 h +48 h +109 h +1 h +10 h +1359 m +10752 m +1 h +4 
h +108 h +10 h +83 h +64 h +31 h +536 h +4 h +4 h +11 h +939 m +146 h +1 h +10 h +1 h +57 h +10753 m +10 h +56 h +195 h +10754 m +229 h +1 h +10755 m +10756 m +109 h +10757 m +10 h +11 h +147 h +3 h +295 h +196 h +4 h +10758 m +1 h +1 h +1 h +4 h +5470 m +10 h +219 m +4 h +1362 m +109 h +1 h +1 h +509 m +4 h +1 h +1 h +4 h +4 h +10759 m +4 h +185 h +4 h +10760 m +124 h +1 h +4 h +7119 m +10761 m +4 h +69 h +82 h +10762 m +4 h +10763 m +1137 h +97 h +27 h +1 h +4 h +10764 m +10765 m +10 h +11 h +10 h +1 h +7047 m +97 h +238 h +1 h +135 h +1 h +10 h +1 h +3088 m +1788 m +10 h +338 h +278 h +371 h +4 h +10766 m +10 h +4 h +536 h +4 h +238 h +203 m +1 h +10 h +57 h +181 h +25 h +10767 m +1 h +2308 m +4 h +4 h +10768 m +10 h +757 h +10 h +10769 m +10770 m +10 h +10 h +4 h +1 h +1 h +11 h +1 h +447 h +10771 m +135 h +4 h +4 h +10772 m +10 h +59 h +190 h +25 h +10773 m +4 h +1 h +1 h +124 h +10 h +11 h +10 h +4 h +4 h +1137 h +4 h +295 h +4 h +10774 m +4 h +10775 m +4 h +123 h +1 h +11 h +11 h +56 h +10 h +4 h +1 h +4 h +135 h +94 h +11 h +8133 m +1027 h +4 h +4 h +276 h +10 h +10 h +1 h +10776 m +10777 m +4 h +4 h +10778 m +10779 m +281 m +4 h +10780 m +1 h +4 h +1 h +11 h +4 h +3799 m +10781 m +10782 m +4 h +10 h +1 h +1 h +4 h +4 h +1 h +172 h +10783 m +104 h +4 h +10 h +5964 m +1 h +13 h +124 h +4 h +10784 m +4 h +55 h +2265 m +412 m +11 h +10 h +4 h +3 h +10785 m +10786 m +10 h +4 h +10 h +11 h +1 h +109 h +25 h +2379 h +265 h +10787 m +8386 m +976 h +10788 m +73 h +368 h +10789 m +10 h +10 h +10790 m +10 h +2594 m +1 h +4 h +10 h +10 h +10791 m +10792 m +10793 m +4 h +10794 m +4 h +1 h +10 h +10 h +1 h +1 h +1 h +10795 m +10796 m +65 h +4 h +10 h +10797 m +1 h +10 h +10798 m +4 h +146 h +4 h +10799 m +986 h +4 h +4 h +2265 m +4 h +4 h +10 h +1 h +10 h +4 h +10 h +10800 m +4 h +123 h +10 h +3 h +10 h +4 h +1137 h +10801 m +41 h +829 h +1 h +911 h +1 h +109 h +4 h +64 h +169 h +4 h +10 h +4 h +1 h +4 h +135 h +10 h +11 h +83 h +779 h +1 h +31 h +1 h +10802 m +10803 m 
+196 h +4 h +371 h +196 h +996 m +1 h +4 h +10804 m +307 h +4 h +1 h +8784 m +4 h +4 h +1 h +10805 m +10 h +3396 m +10806 m +2733 h +757 h +10 h +4 h +10807 m +10808 m +1 h +31 h +167 h +10 h +1 h +10809 m +1 h +1 h +11 h +1 h +1201 h +10810 m +1 h +10811 m +224 h +4 h +1 h +10812 m +10813 m +4 h +10814 m +10 h +10815 m +1 h +1 h +1027 h +11 h +203 h +10816 m +10817 m +57 h +190 h +97 h +104 h +45 h +25 h +41 h +4 h +278 h +1 h +10818 m +190 h +3 h +1 h +4 h +196 h +82 h +4 h +6766 m +1 h +45 h +10 h +10819 m +4 h +10 h +10820 m +4 h +4 h +938 m +10 h +10 h +1 h +1 h +3 h +114 h +10821 m +10822 m +10 h +110 h +4 h +10823 m +82 h +1 h +10824 m +1374 m +1 h +1 h +1886 m +4 h +7253 m +1 h +297 h +447 h +10 h +935 h +1 h +4 h +10825 m +124 h +3845 m +10826 m +10827 m +4 h +181 h +57 h +4033 m +10828 m +10829 m +4 h +1 h +12 h +332 h +10830 m +1 h +10831 m +82 h +399 h +10 h +10 h +1 h +10832 m +10833 m +8 h +10 h +4 h +4 h +109 h +11 h +1 h +911 h +4 h +4 h +138 h +57 h +10834 m +4 h +10 h +4 h +10835 m +4 h +1 h +56 h +4 h +109 h +238 h +10 h +1 h +1 h +112 h +113 h +1 h +4 h +4151 m +8332 m +1 h +12 h +25 h +4 h +11 h +55 h +266 h +4 h +10836 m +10837 m +1 h +10 h +109 h +10838 m +10839 m +1 h +1 h +124 h +82 h +10840 m +10 h +10 h +8332 m +186 h +10 h +10 h +4 h +1 h +1710 m +1 h +1 h +10 h +1016 h +4 h +4 h +4 h +10 h +4 h +4 h +108 h +4 h +10841 m +4 h +297 h +11 h +4 h +11 h +10 h +3161 m +10 h +10 h +4 h +1545 m +10 h +10842 m +109 h +10843 m +353 m +10844 m +57 h +4 h +265 h +4 h +10845 m +4 h +4 h +626 h +10 h +1 h +10846 m +10 h +1766 h +1 h +10 h +10847 m +11 h +10848 m +4 h +4 h +1 h +10849 m +83 h +195 h +11 h +185 h +1 h +147 h +4297 m +10850 m +10 h +4 h +229 h +1 h +10 h +1 h +4 h +4 h +4 h +181 h +10 h +10851 m +4 h +1 h +2442 m +10852 m +55 h +1 h +10 h +11 h +158 h +4 h +10853 m +1 h +10 h +1 h +912 m +10854 m +10855 m +10856 m +5445 m +10 h +4 h +195 h +2261 m +4 h +11 h +4 h +82 h +10857 m +4 h +10 h +10858 m +4668 m +10859 m +10860 m +10 h +184 h 
+11 h +10 h +6808 m +1 h +1410 m +1 h +10 h +4 h +10 h +10 h +1 h +10 h +10 h +170 h +97 h +10861 m +10862 m +1541 m +4 h +83 h +113 h +3 h +700 m +1 h +10863 m +10864 m +258 h +31 h +10865 m +3533 m +11 h +10 h +258 h +10866 m +22 h +1220 m +10867 m +10868 m +170 h +5917 m +11 h +3 h +59 h +109 h +10869 m +59 h +4 h +10870 m +4 h +31 h +1 h +10871 m +10872 m +307 h +1 h +4 h +10 h +4 h +10 h +1 h +3 h +10 h +1 h +270 h +185 h +11 h +4 h +10 h +10 h +1 h +10873 m +10874 m +10875 m +41 h +196 h +169 h +4 h +27 h +10876 m +1 h +4 h +93 h +41 h +10877 m +1 h +31 h +192 h +4 h +75 h +92 h +295 h +31 h +10 h +10 h +4 h +55 h +4 h +28 h +124 h +299 h +1674 m +10878 m +10879 m +1 h +1 h +4 h +10880 m +4 h +10 h +10881 m +10882 m +808 m +4 h +4 h +57 h +59 h +4 h +307 h +4 h +339 m +8386 m +10 h +167 h +10883 m +10884 m +10 h +57 h +10885 m +4 h +443 h +4 h +10 h +1620 h +10886 m +1 h +860 m +1 h +4 h +266 h +83 h +3 h +258 h +109 h +3 h +10 h +10887 m +4 h +25 h +566 m +65 h +10 h +1 h +319 h +265 h +10888 m +74 h +10 h +10889 m +964 m +4 h +4 h +322 m +338 h +4 h +10 h +569 h +1 h +64 h +10 h +692 h +1 h +1 h +11 h +4 h +25 h +1 h +4 h +1 h +4 h +1 h +4 h +10 h +10890 m +10 h +10 h +10891 m +10 h +10892 m +10893 m +1 h +10 h +10894 m +4 h +83 h +10 h +4 h +4 h +4 h +1 h +1 h +4 h +10 h +147 h +4 h +10895 m +806 m +399 h +10896 m +10897 m +3680 m +10898 m +11 h +11 h +1 h +138 h +92 h +1218 m +172 h +10 h +10899 m +4 h +1 h +11 h +4 h +169 h +108 h +83 h +4 h +10900 m +4 h +10 h +10901 m +10902 m +10 h +266 h +10903 m +10904 m +4542 m +83 h +45 h +4 h +1 h +192 h +10905 m +1 h +10906 m +1 h +10907 m +10908 m +10 h +4 h +10 h +10909 m +10 h +10 h +10 h +1 h +10910 m +33 h +10 h +1 h +10911 m +4 h +278 h +506 m +4 h +10912 m +9916 m +10913 m +4 h +74 h +41 h +4 h +74 h +4 h +4 h +10914 m +4 h +11 h +4 h +104 h +479 m +10 h +4 h +4 h +1 h +4 h +10 h +262 h +4 h +4 h +4 h +11 h +124 h +4 h +10 h +10915 m +124 h +10916 m +386 h +4 h +276 h +4 h +1 h +10 h +10 h +57 h +4 h +11 
h +10917 m +4 h +386 h +4 h +140 h +10 h +10918 m +10919 m +74 h +1 h +4 h +4 h +25 h +10920 m +626 h +4 h +1 h +10921 m +158 h +692 h +4 h +4 h +2434 m +82 h +10 h +258 h +1 h +4 h +4 h +4 h +4 h +10922 m +10923 m +938 h +4 h +22 h +4 h +10 h +10 h +4 h +27 h +1 h +1 h +986 h +10924 m +10 h +447 h +10925 m +77 h +22 h +41 h +1 h +10 h +8 h +1 h +3 h +1 h +2265 h +4 h +4 h +195 h +3732 m +4 h +195 h +1 h +1 h +10926 m +1 h +4 h +1 h +4 h +156 h +4 h +1 h +10 h +65 h +2265 h +59 h +10927 m +10 h +10 h +82 h +1 h +10 h +1 h +250 h +97 h +229 h +295 h +10928 m +10929 m +1 h +1 h +1 h +9831 m +10 h +123 h +118 h +10930 m +10 h +82 h +83 h +4 h +83 h +4 h +229 h +1 h +4 h +93 h +10 h +556 h +4 h +1 h +83 h +4 h +1 h +4 h +1 h +10931 m +4 h +278 h +4 h +74 h +1 h +4 h +10 h +14 m +10932 m +4 h +4 h +4 h +4 h +10933 m +10 h +10 h +10 h +10934 m +4 h +2625 h +4 h +4 h +10935 m +4 h +4 h +10 h +10 h +10936 m +4 h +4 h +10 h +4 h +1 h +1 h +3 h +3836 m +113 h +10 h +4 h +4 h +82 h +3 h +83 h +109 h +4 h +1 h +278 h +31 h +59 h +885 m +4 h +114 h +1 h +4 h +1 h +4 h +383 h +1 h +57 h +59 h +447 h +4 h +1 h +10937 m +1 h +10 h +1 h +10938 m +11 h +1 h +4 h +10939 m +4 h +1 h +10940 m +10 h +1 h +10 h +4 h +97 h +10941 m +1 h +195 h +4 h +10 h +1 h +10942 m +10943 m +1 h +1 h +3 h +1 h +1 h +10944 m +2205 m +10945 m +7479 m +4 h +31 h +10 h +1 h +4 h +10 h +10 h +6378 m +11 h +48 h +2172 h +3 h +1 h +4 h +1 h +114 h +1 h +10 h +626 h +1 h +4 h +4 h +11 h +4 h +1508 m +332 h +1 h +10 h +4 h +10 h +1 h +4 h +109 h +170 h +33 h +10946 m +10947 m +4 h +10 h +57 h +10948 m +4 h +1 h +4 h +4 h +10949 m +1796 h +12 h +10950 m +383 h +4 h +1 h +1 h +8305 m +4 h +10951 m +10952 m +435 m +196 h +4 h +1 h +1 h +1 h +10 h +4 h +10 h +808 m +4 h +10953 m +10954 m +4 h +4 h +10 h +10955 m +10956 m +10957 m +4 h +10958 m +929 m +1 h +10959 m +31 h +4 h +10960 m +92 h +10961 m +25 h +4 h +4 h +41 h +12 h +276 h +358 h +10962 m +22 h +4 h +4 h +74 h +4 h +74 h +1 h +11 h +1 h +358 h +57 h +4 h 
+4 h +40 h +10 h +41 h +358 h +10 h +125 h +10963 m +241 m +74 h +10 h +1 h +4 h +386 h +10964 m +1 h +4 h +2374 m +10 h +4 h +10 h +4 h +1 h +10 h +295 h +1 h +146 h +114 h +10965 m +1 h +10 h +295 h +1 h +11 h +10966 m +10967 m +74 h +10968 m +1470 h +1 h +10 h +4 h +4 h +10 h +4 h +10969 m +4 h +4 h +10 h +25 h +4 h +3 h +10 h +4 h +143 h +1470 h +4 h +195 h +601 h +1 h +11 h +10 h +1 h +10970 m +4 h +123 h +4 h +195 h +3026 m +1 h +10971 m +10 h +1 h +169 h +59 h +123 h +25 h +1 h +10 h +10 h +1 h +10972 m +82 h +10 h +124 h +4 h +94 h +4 h +4 h +41 h +164 h +9 h +4 h +12 h +1 h +146 h +1 h +1 h +4 h +10973 m +1 h +83 h +1 h +195 h +1309 h +10974 m +82 h +1499 m +10 h +10975 m +11 h +10 h +1 h +4 h +10976 m +55 h +10977 m +65 h +4 h +10978 m +1 h +10979 m +4 h +36 h +1 h +463 h +1127 m +10980 m +1 h +31 h +104 h +124 h +1 h +4 h +1 h +55 h +10981 m +55 h +3 h +1 h +1 h +10982 m +97 h +1 h +4 h +10 h +297 h +276 h +1 h +10983 m +114 h +114 h +1 h +68 m +4 h +138 h +4 h +10984 m +4 h +10985 m +1 h +4 h +4124 m +10986 m +104 h +2625 h +10 h +10987 m +1 h +4 h +10988 m +10 h +4 h +10 h +1 h +4 h +10989 m +12 h +10 h +1 h +45 h +10990 m +1642 h +1016 h +4 h +10991 m +10 h +900 m +10 h +10992 m +8486 m +10993 m +11 h +4 h +2215 m +4 h +4 h +4 h +4 h +4 h +478 h +1 h +125 h +125 h +5869 m +4 h +4 h +57 h +11 h +4596 m +10 h +1 h +45 h +124 h +1 h +10 h +109 h +167 h +10 h +10994 m +10 h +10995 m +692 h +10996 m +124 h +578 h +10 h +10997 m +10 h +10998 m +10999 m +11000 m +59 h +4 h +4 h +11001 m +10 h +10 h +74 h +11002 m +4 h +10 h +1 h +11003 m +172 h +11004 m +4 h +10 h +10 h +4 h +10 h +313 h +181 h +10 h +1 h +1 h +36 h +4 h +64 h +11005 m +11006 m +4 h +11007 m +172 h +10 h +82 h +11008 m +10 h +113 h +11009 m +11010 m +1 h +83 h +1 h +11 h +1 h +1 h +4 h +1 h +4 h +4 h +4 h +1 h +10 h +1 h +103 m +11011 m +4 h +1083 h +4 h +4 h +4 h +11012 m +1 h +737 m +1 h +57 h +10 h +1 h +928 m +4 h +74 h +4 h +11013 m +383 h +3 h +4 h +119 h +4 h +8179 m +1 h +48 h +1 h 
+74 h +9228 m +4 h +8 h +190 h +1 h +2887 m +82 h +367 m +10 h +11014 m +10 h +4 h +146 h +11015 m +4 h +1 h +82 h +10324 m +520 h +1 h +4 h +1 h +82 h +139 h +1 h +443 h +11016 m +6668 m +10 h +295 h +11017 m +229 h +11018 m +1 h +10 h +11 h +10 h +4 h +307 h +124 h +57 h +196 h +4 h +7913 m +10 h +4 h +1 h +2484 m +4 h +25 h +4 h +3680 m +196 h +7798 m +279 h +10 h +57 h +82 h +11019 m +31 h +358 h +11020 m +4 h +172 h +11021 m +11022 m +4 h +11023 m +4 h +1 h +11024 m +4 h +4 h +1 h +4 h +11025 m +143 h +4 h +4 h +1 h +79 h +3 h +4 h +11026 m +1 h +11027 m +11028 m +3134 m +4 h +4 h +11029 m +31 h +109 h +74 h +1 h +13 h +11030 m +10 h +4 h +124 h +1 h +124 h +4 h +9065 m +4 h +140 h +10 h +10 h +4 h +10 h +10 h +10 h +59 h +185 h +1 h +3707 m +11 h +11031 m +1 h +11032 m +11033 m +11034 m +11035 m +4 h +124 h +4 h +10 h +4 h +1 h +1406 h +4 h +1 h +11036 m +11037 m +135 h +1 h +146 h +10 h +11038 m +4 h +1 h +83 h +11039 m +11 h +4 h +10 h +4 h +77 h +2412 m +3095 m +4 h +11040 m +1 h +569 h +692 h +1 h +571 m +1 h +3 h +1 h +4 h +4 h +124 h +11041 m +11042 m +10 h +1 h +1 h +11043 m +11044 m +31 h +5 h +27 h +1 h +79 h +135 h +11045 m +10 h +4 h +13 h +25 h +11046 m +10 h +10 h +7253 m +1 h +1 h +11047 m +1 h +4 h +118 h +10 h +4 h +4 h +11048 m +4 h +11049 m +11050 m +41 h +4 h +79 h +1 h +11051 m +4 h +4 h +11052 m +73 h +11053 m +1 h +4 h +1 h +11054 m +8 h +170 h +4 h +10 h +65 h +1 h +4 h +1 h +986 h +4 h +4 h +104 h +148 m +1 h +276 h +11055 m +11056 m +5206 m +1 h +11 h +1953 m +11057 m +4 h +1 h +10 h +41 h +11058 m +10 h +10 h +11059 m +4 h +69 h +11060 m +11061 m +4 h +4 h +1 h +11062 m +114 h +976 h +11063 m +1 h +129 h +10 h +4 h +3 h +10 h +31 h +692 h +10 h +1 h +464 h +83 h +10 h +11064 m +109 h +10 h +55 h +10 h +1 h +4 h +1 h +400 m +1 h +10 h +4 h +4 h +4 h +11065 m +11066 m +4 h +83 h +1 h +4 h +1 h +1 h +4 h +64 h +65 h +10 h +11067 m +10 h +10 h +4 h +1 h +1 h +3177 m +274 h +4 h +83 h +4 h +79 h +82 h +4 h +1 h +1 h +11068 m +11069 m 
+11070 m +11071 m +10 h +1 h +4 h +104 h +10 h +1359 m +10 h +8 h +265 h +99 m +4 h +4 h +1 h +4 h +11 h +13 h +11072 m +1 h +1 h +4 h +10 h +11073 m +1 h +10 h +2435 m +11074 m +172 h +83 h +82 h +10 h +40 h +1 h +11075 m +11076 m +4 h +11077 m +1 h +12 h +77 h +4 h +4 h +11078 m +124 h +4 h +10 h +1 h +10 h +10 h +192 h +11079 m +11080 m +4 h +124 h +11081 m +1 h +11082 m +10 h +4 h +1 h +295 h +4 h +601 h +1 h +520 h +4 h +386 h +1 h +11083 m +11084 m +11085 m +4 h +25 h +11086 m +11087 m +425 m +11088 m +1677 m +11089 m +181 h +601 h +4 h +4 h +65 h +464 h +157 h +4 h +11090 m +10 h +11091 m +307 h +1 h +1 h +169 h +1 h +10 h +1 h +1 h +11092 m +1650 h +11093 m +1 h +59 h +464 h +10 h +11094 m +57 h +11095 m +2116 m +109 h +11096 m +1 h +4 h +4 h +22 h +109 h +11097 m +4 h +4 h +79 h +173 h +114 h +11098 m +1137 h +1 h +10 h +4 h +10 h +4 h +730 m +11099 m +10689 m +4 h +4 h +11100 m +11101 m +10 h +11102 m +10 h +1981 m +3177 m +4 h +118 h +1 h +1 h +4 h +11103 m +10 h +4 h +146 h +11 h +4 h +10 h +11 h +10 h +4 h +11104 m +11105 m +4 h +1 h +2733 h +264 m +1 h +4 h +1105 h +4 h +36 h +996 m +158 h +1 h +11106 m +1 h +10 h +1 h +2245 m +12 h +124 h +164 h +83 h +196 h +11107 m +11108 m +8114 m +4 h +11109 m +4 h +1 h +11110 m +307 h +195 h +7535 m +11111 m +91 h +25 h +208 m +92 h +1 h +10 h +11112 m +11113 m +11114 m +4 h +4 h +4 h +196 h +359 h +11115 m +31 h +4 h +65 h +10 h +11116 m +167 h +250 h +124 h +4 h +1 h +4 h +10 h +57 h +10 h +11117 m +4 h +4 h +10 h +55 h +11118 m +4 h +1083 h +11119 m +1 h +10 h +110 h +11120 m +4030 m +11 h +11121 m +143 h +1 h +1309 h +976 h +11122 m +1 h +4 h +4 h +10 h +4 h +1 h +10 h +1 h +11123 m +4 h +3 h +10 h +4 h +331 m +11124 m +64 h +135 h +4 h +11125 m +147 h +5225 m +4 h +1 h +11126 m +1 h +4 h +4 h +11127 m +124 h +295 h +4 h +10 h +11128 m +172 h +4 h +11129 m +4 h +73 h +73 h +1261 h +5046 m +4 h +4 h +11130 m +1 h +1 h +10 h +10 h +11131 m +10 h +11132 m +11133 m +10 h +4 h +1 h +601 h +10 h +11134 m +10 h 
+279 h +4 h +11135 m +109 h +1 h +4 h +146 h +4 h +8035 m +569 h +8767 m +367 m +1 h +4 h +1 h +104 h +10 h +97 h +25 h +10 h +4 h +57 h +1 h +181 h +4 h +56 h +4 h +1 h +11136 m +1261 h +10 h +8571 m +7641 m +181 h +3293 m +109 h +59 h +11137 m +57 h +65 h +11138 m +73 h +10 h +11139 m +1771 m +4 h +4 h +2591 m +11 h +1 h +59 h +11140 m +10 h +4 h +4 h +2887 m +4 h +11141 m +3 h +994 m +11142 m +11143 m +41 h +110 h +4 h +113 h +1 h +4 h +1 h +11144 m +11145 m +4 h +1 h +556 h +11146 m +1 h +4 h +1 h +97 h +10 h +4 h +31 h +109 h +147 h +82 h +83 h +139 h +935 h +11147 m +4 h +10 h +10 h +4 h +1016 h +1 h +1 h +11148 m +4 h +447 h +123 h +1 h +97 h +12 h +10 h +1 h +3 h +338 h +10 h +307 h +1796 h +74 h +4 h +57 h +1 h +4 h +59 h +1 h +1 h +77 h +1 h +4 h +11149 m +82 h +11150 m +5785 m +11151 m +11152 m +104 h +11153 m +25 h +11 h +11154 m +11155 m +74 h +1 h +11156 m +104 h +82 h +258 h +41 h +10 h +10 h +1 h +11157 m +10 h +10 h +4 h +109 h +11158 m +65 h +4 h +59 h +4 h +3 h +1 h +139 h +11159 m +1089 m +10 h +9691 m +1 h +4 h +45 h +4 h +83 h +4 h +11160 m +11161 m +11162 m +190 h +3 h +10 h +1 h +1337 m +10 h +11 h +135 h +28 h +4 h +3 h +1 h +10 h +10 h +31 h +443 h +4 h +1 h +1 h +4 h +11163 m +11164 m +4 h +4 h +10 h +1 h +195 h +4 h +10 h +1 h +22 h +4 h +3 h +146 h +11165 m +256 m +45 h +11166 m +2788 h +1 h +10 h +4 h +779 h +11167 m +1 h +4 h +110 h +1 h +1 h +82 h +2887 h +4 h +4 h +4 h +3455 m +4 h +10 h +1 h +4 h +1 h +10 h +11168 m +3 h +4 h +1 h +10 h +57 h +170 h +10 h +1 h +11 h +10 h +444 m +55 h +1 h +11169 m +11170 m +103 h +4 h +10 h +109 h +11171 m +4 h +11172 m +4 h +94 h +1389 m +4 h +4 h +1 h +4 h +359 h +11173 m +4 h +1 h +10 h +10 h +4 h +79 h +1 h +146 h +10 h +11174 m +10 h +4 h +11175 m +1 h +11176 m +10283 m +11177 m +10 h +4 h +1 h +2840 m +82 h +4 h +4 h +11178 m +10 h +6469 m +10111 m +1 h +1 h +10 h +4 h +4 h +358 h +278 h +10 h +4 h +4 h +10 h +4 h +4 h +1 h +125 h +10 h +4 h +1 h +11 h +1 h +4 h +4 h +11179 m +9372 m +4 h +4 
h +11180 m +4 h +3272 m +1201 h +11181 m +383 h +1 h +10 h +3737 m +1 h +11182 m +11183 m +1 h +208 m +11184 m +1 h +11185 m +297 h +1737 m +10 h +11186 m +11187 m +83 h +4 h +11188 m +4 h +73 h +1 h +10 h +11189 m +1092 m +11190 m +4 h +4 h +11191 m +57 h +10 h +10 h +10 h +1 h +4 h +172 h +4 h +4 h +11192 m +83 h +104 h +1 h +4 h +1691 m +1 h +4 h +10 h +1 h +4 h +1 h +59 h +1482 m +11193 m +1 h +1 h +6139 m +73 h +11194 m +4 h +11 h +11 h +59 h +1 h +1 h +10 h +4 h +11195 m +4 h +2374 m +10 h +10 h +4 h +11196 m +10 h +990 m +1 h +64 h +208 h +536 h +83 h +4 h +5567 m +10 h +11197 m +1 h +11198 m +4 h +1 h +1 h +276 h +1 h +11199 m +11200 m +358 h +56 h +4 h +4 h +185 h +10 h +59 h +11201 m +4 h +57 h +939 m +4 h +10 h +4 h +109 h +996 m +4 h +4 h +109 h +185 h +1 h +4 h +11202 m +23 h +11203 m +10 h +11204 m +55 h +11205 m +5976 m +13 h +59 h +4 h +11 h +276 h +11206 m +10 h +11207 m +4 h +569 h +1 h +11208 m +4 h +11 h +11 h +41 h +10 h +10 h +181 h +64 h +11209 m +11 h +41 h +4 h +10 h +11210 m +4 h +4 h +10 h +196 h +1 h +4 h +3 h +3679 m +2883 m +10 h +1 h +4 h +10 h +10 h +322 m +11211 m +1 h +82 h +1016 h +65 h +31 h +48 h +146 h +11 h +1 h +1 h +11 h +4 h +10 h +4857 m +229 h +11212 m +1 h +4 h +1 h +13 h +11213 m +1 h +10 h +488 h +10 h +1 h +4 h +10 h +4 h +1 h +11214 m +4 h +1 h +1 h +2433 m +11215 m +11216 m +11217 m +11218 m +358 h +10 h +11219 m +10 h +10 h +10 h +83 h +4 h +10 h +1 h +10 h +11220 m +4 h +4 h +11221 m +4 h +157 h +1 h +82 h +104 h +4 h +10 h +11222 m +185 h +4 h +3 h +4 h +4 h +195 h +11223 m +146 h +4 h +13 h +11224 m +4 h +4 h +1868 m +3 h +4 h +1 h +11225 m +4 h +10 h +11226 m +10 h +1 h +10 h +4 h +4 h +1 h +82 h +147 h +4 h +1 h +1 h +31 h +10 h +383 h +4 h +4 h +1 h +118 h +4 h +4 h +1 h +443 h +1 h +447 h +169 h +371 h +4 h +1 h +2733 h +4 h +10 h +11227 m +10 h +4 h +4 h +11228 m +69 h +1 h +1 h +1 h +322 m +1 h +1 h +11229 m +146 h +4 h +11 h +4 h +83 h +2309 m +10 h +147 h +11230 m +11 h +4 h +10 h +41 h +59 h +1 h +10 h 
+4 h +143 h +4 h +4 h +1 h +4 h +9501 m +45 h +1 h +170 h +41 h +138 h +173 h +4 h +1 h +109 h +4 h +4 h +4 h +4 h +114 h +3 h +172 h +12 h +4 h +74 h +10 h +27 h +11231 m +265 h +10 h +109 h +4 h +10 h +1 h +10 h +1 h +10 h +74 h +1 h +1 h +10 h +11232 m +4 h +1253 m +10 h +6855 m +4 h +4 h +57 h +31 h +11233 m +10 h +192 h +125 h +1 h +11234 m +4 h +146 h +10 h +1 h +10 h +170 h +4 h +167 h +4 h +1 h +11 h +1 h +4 h +2846 m +5199 m +11235 m +124 h +11 h +4 h +11236 m +4 h +10 h +4 h +10 h +289 h +11237 m +4 h +4 h +3768 m +11238 m +158 h +119 h +1 h +36 h +4 h +147 h +1 h +338 h +4 h +109 h +83 h +1 h +112 h +11239 m +11240 m +1 h +11241 m +4 h +4 h +4 h +125 h +477 m +109 h +4 h +10 h +1 h +110 h +11242 m +1 h +10 h +10 h +10 h +11243 m +4 h +97 h +82 h +10 h +11 h +13 h +1 h +4 h +435 m +11244 m +1 h +41 h +4 h +181 h +1 h +11245 m +4 h +1 h +4 h +11246 m +1 h +25 h +11247 m +11248 m +1 h +10 h +31 h +2958 m +11249 m +10 h +10 h +11250 m +1 h +4 h +11251 m +10 h +1030 h +125 h +1261 h +1 h +1 h +10 h +40 h +4 h +4 h +2265 h +1330 m +533 h +4 h +1 h +10 h +11252 m +11253 m +1 h +11254 m +57 h +4 h +124 h +1 h +59 h +4292 m +4 h +110 h +1 h +97 h +1 h +11255 m +4 h +59 h +83 h +109 h +10 h +4 h +10 h +1 h +11256 m +1 h +4 h +170 h +4 h +10 h +110 h +4 h +45 h +11257 m +4 h +11258 m +4 h +11259 m +4 h +11 h +1 h +4 h +10 h +4 h +11260 m +1 h +11261 m +403 h +1 h +10 h +41 h +11262 m +1374 m +169 h +11263 m +27 h +45 h +11264 m +583 h +1691 m +4 h +11 h +4 h +10 h +4 h +4 h +11265 m +4 h +69 h +5053 m +11266 m +1 h +1619 h +185 h +1 h +3 h +4 h +11267 m +4 h +258 h +1 h +4 h +1 h +11 h +1 h +4 h +11268 m +11269 m +11270 m +59 h +1 h +11271 m +10 h +4 h +11272 m +4 h +10 h +83 h +11273 m +353 m +124 h +1 h +4 h +11274 m +4 h +94 h +10 h +1 h +1 h +10 h +1880 m +10 h +1 h +1 h +1 h +11275 m +11276 m +1 h +25 h +64 h +1 h +4 h +56 h +1 h +11277 m +4 h +11278 m +4 h +10 h +11279 m +11280 m +147 h +103 h +10 h +83 h +56 h +1 h +10 h +11281 m +11282 m +770 m +5526 m 
+11283 m +129 h +11284 m +10 h +124 h +195 h +1 h +1714 m +10 h +10 h +125 h +169 h +55 h +9139 m +4 h +11285 m +1 h +1 h +11 h +4 h +4 h +1691 h +4 h +630 m +124 h +4 h +10 h +1 h +4 h +4 h +1 h +11286 m +4 h +4 h +1 h +11 h +1 h +97 h +4 h +1 h +3 h +1 h +1454 m +1 h +3 h +10 h +10 h +4 h +4 h +265 h +41 h +11287 m +4 h +10 h +11288 m +55 h +4 h +1 h +1 h +4 h +1137 h +11289 m +4 h +10 h +25 h +4 h +1 h +10 h +114 h +4 h +1 h +1 h +10 h +10 h +11290 m +4 h +1 h +10 h +10 h +109 h +4 h +11291 m +25 h +3141 m +4 h +3 h +10 h +10 h +4 h +1822 m +119 h +12 h +1 h +3 h +82 h +4 h +36 h +73 h +4 h +692 h +1 h +1 h +11292 m +4 h +11293 m +10 h +1 h +4 h +22 h +124 h +11294 m +10 h +11295 m +11296 m +2794 m +64 h +1 h +4 h +4 h +146 h +195 h +10 h +1 h +10 h +109 h +79 h +169 h +1 h +358 h +11297 m +103 h +4 h +10 h +4 h +4 h +1 h +10 h +4 h +11298 m +1 h +10 h +1 h +1 h +4 h +4 h +10 h +82 h +10 h +262 h +11299 m +10 h +10 h +6129 m +114 h +31 h +11300 m +1 h +57 h +10 h +82 h +4 h +4 h +4 h +1 h +4 h +109 h +443 h +11301 m +11 h +4 h +10 h +25 h +1 h +123 h +1 h +74 h +6770 m +10 h +493 m +11 h +11302 m +4 h +4 h +1 h +10 h +4 h +1 h +11303 m +1470 h +4 h +4 h +11304 m +11305 m +4 h +11 h +4 h +4 h +1 h +11306 m +11 h +11307 m +97 h +1 h +10 h +109 h +4 h +1 h +10 h +4 h +11308 m +10 h +11309 m +11310 m +4 h +25 h +1 h +258 h +10 h +4 h +195 h +74 h +536 h +10 h +10 h +801 m +1 h +2002 m +109 h +10 h +11311 m +4714 m +11312 m +4 h +82 h +4 h +1 h +10 h +1 h +1 h +4 h +11313 m +172 h +1 h +109 h +27 h +1 h +1 h +10 h +4 h +4 h +1 h +4 h +109 h +25 h +11 h +1 h +640 m +11314 m +4 h +10 h +1 h +4 h +4 h +11315 m +4 h +109 h +4 h +4 h +11 h +11316 m +1478 h +4 h +1 h +2235 m +59 h +4 h +4 h +1 h +11 h +11317 w +1372 m +11318 m +11319 m +359 h +11320 m +4 h +1089 m +10 h +73 h +1 h +1 h +1 h +59 h +1 h +11321 m +4 h +1 h +10 h +97 h +10 h +1 h +4 h +4 h +843 m +4 h +4 h +4 h +1 h +4 h +1 h +3 h +601 h +4 h +11322 m +447 h +10 h +4 h +4 h +4 h +10 h +11323 m +1 h +11324 m +1 
h +10 h +1 h +123 h +4 h +11325 m +4 h +4 h +10 h +1470 h +3240 m +4 h +11326 m +1 h +10 h +10 h +11327 m +11328 m +4 h +601 h +41 h +4 h +147 h +4 h +13 h +4 h +11329 m +10 h +57 h +258 h +10 h +10 h +208 h +1 h +83 h +4 h +11330 m +11331 m +1 h +11 h +1 h +1 h +359 h +48 h +10 h +4 h +59 h +11332 m +1 h +41 h +1 h +4 h +146 h +4 h +1 h +4 h +4 h +1 h +4338 m +10 h +11333 m +1 h +10 h +10 h +3351 m +31 h +10 h +10 h +4 h +195 h +10 h +10 h +119 h +4 h +10 h +10 h +40 h +4 h +11334 m +25 h +139 h +146 h +97 h +125 h +147 h +1 h +1 h +11335 m +11336 m +10 h +4 h +147 h +11337 m +1 h +103 h +1403 h +10 h +123 h +11 h +11338 m +1 h +4 h +3 h +1 h +11339 m +8879 m +1 h +11340 m +10 h +10 h +11341 m +4 h +74 h +11342 m +1 h +4 h +158 h +4 h +10 h +4 h +4 h +74 h +4 h +4 h +4 h +11343 m +4 h +11344 m +1 h +10 h +272 m +11345 m +4 h +10 h +174 m +4 h +1 h +425 m +1 h +1 h +2205 m +4 h +109 h +10 h +4 h +270 h +1 h +56 h +3 h +11346 m +31 h +109 h +1820 m +1 h +4 h +10 h +1 h +195 h +1 h +10 h +4 h +11347 m +10345 m +1 h +11348 m +1 h +10 h +1 h +11349 m +10 h +258 h +140 h +11350 m +79 h +11351 m +3089 m +11352 m +186 h +1 h +6558 m +4 h +10 h +1 h +4 h +10 h +4 h +1766 h +1 h +11353 m +1 h +109 h +169 h +4 h +11354 m +10 h +97 h +73 h +4 h +1780 m +11355 m +1 h +11 h +3 h +10 h +10 h +4 h +11356 m +11 h +123 h +11357 m +4 h +64 h +10 h +4 h +119 h +4 h +181 h +4 h +3150 m +56 h +1 h +4 h +11358 m +4 h +1722 m +698 m +10 h +11359 m +4 h +443 h +4 h +65 h +6144 m +4 h +10 h +4 h +4 h +13 h +25 h +307 h +4 h +157 h +22 h +1 h +196 h +12 h +371 h +1 h +4 h +1 h +4 h +1 h +4 h +1 h +4 h +11360 m +119 h +36 h +4 h +11361 m +11362 m +4 h +224 h +1 h +109 h +97 h +4 h +10 h +4 h +4 h +12 h +4 h +11363 m +5863 m +4 h +10 h +10 h +1442 m +1 h +10 h +82 h +601 h +1 h +11364 m +11365 m +10 h +1 h +59 h +124 h +10 h +1 h +1 h +698 m +11 h +1 h +4 h +1 h +10 h +11366 m +11367 m +4 h +11 h +1 h +1 h +109 h +10 h +1 h +10 h +1 h +204 h +4 h +11368 m +10 h +4 h +11369 m +4 h +1 h +12 h 
+57 h +4 h +11370 m +4 h +55 h +11371 m +4 h +146 h +1796 h +83 h +10 h +11372 m +478 h +4 h +4 h +4 h +41 h +11 h +266 h +1 h +155 m +1 h +124 h +642 m +92 h +1847 m +11373 m +147 h +1 h +10 h +4 h +104 h +4 h +104 h +763 m +1 h +10 h +1 h +1 h +914 m +11374 m +4 h +7913 m +1 h +4 h +10 h +22 h +4 h +1 h +1 h +10 h +4 h +1 h +10 h +11375 m +4 h +1 h +3 h +4 h +4 h +94 h +4 h +11376 m +124 h +135 h +4 h +7243 m +1 h +368 h +11377 m +1 h +4 h +582 m +1 h +1 h +11378 m +367 h +4 h +698 h +4 h +4 h +4 h +4 h +1 h +10 h +377 m +11379 m +1 h +10 h +4 h +124 h +57 h +10 h +11380 m +1 h +11381 m +4 h +25 h +1 h +1 h +4 h +10 h +4 h +3 h +10 h +9075 m +119 h +1 h +11382 m +1 h +1685 h +1 h +10 h +11383 m +229 h +104 h +1116 m +4 h +114 h +1 h +4 h +10 h +1 h +4 h +185 h +5145 m +4 h +4 h +4 h +4 h +125 h +3877 m +4 h +4 h +59 h +10 h +270 h +11384 m +125 h +1 h +11385 m +172 h +2090 m +9120 m +692 h +4 h +4 h +11386 m +4 h +258 h +1 h +10 h +1 h +4 h +307 h +1 h +4 h +10 h +512 m +4 h +4 h +125 h +4 h +10 h +4 h +1 h +4 h +4 h +11387 m +12 h +4 h +4 h +124 h +4 h +10 h +11388 m +278 h +10 h +1 h +4 h +4 h +10 h +1 h +11389 m +4 h +1 h +10 h +11390 m +4 h +4 h +11 h +3028 m +10 h +10 h +10 h +10 h +11 h +11391 m +3 h +4 h +125 h +1 h +10 h +4 h +11392 m +1 h +11 h +1 h +11393 m +1 h +164 h +4 h +1 h +1 h +1 h +1 h +146 h +4 h +104 h +1 h +11394 m +172 h +4 h +27 h +4 h +82 h +10 h +4 h +1 h +79 h +1 h +4 h +1 h +4 h +10 h +11395 m +10 h +266 h +295 h +1 h +10 h +10 h +1 h +4 h +10 h +1493 m +1027 h +4 h +11 h +65 h +4 h +1822 m +4 h +10 h +1 h +4 h +4 h +4 h +11396 m +167 h +1 h +181 h +114 h +1 h +1 h +11397 m +10 h +11 h +5122 m +4904 m +1 h +4 h +488 h +1 h +1 h +10 h +11398 m +9669 m +156 h +125 h +10 h +4 h +1 h +4 h +3 h +556 h +383 h +1 h +10 h +601 h +1 h +4 h +11399 m +1 h +4 h +10 h +10 h +10 h +11400 m +1403 h +125 h +1 h +48 h +4 h +4 h +1 h +1089 m +4 h +1089 h +4 h +7243 m +41 h +10 h +1 h +4 h +8 h +11401 m +11402 m +11403 m +164 h +256 m +1 h +10 h +4 h +10 
h +626 h +1 h +1 h +278 h +55 h +10 h +4 h +11404 m +31 h +10 h +10 h +4 h +4 h +4 h +9372 m +65 h +10 h +4 h +190 h +4 h +1780 m +1 h +11405 m +11406 m +146 h +4 h +11 h +10 h +55 h +22 h +1 h +1 h +123 h +278 h +4 h +82 h +4 h +11407 m +83 h +1 h +4 h +91 h +4 h +4 h +11408 m +10 h +129 h +11409 m +4 h +104 h +11410 m +4 h +11411 m +70 m +9411 m +10 h +11412 m +2532 m +11413 m +4 h +4 h +2045 m +1 h +11414 m +10 h +1 h +83 h +22 h +10 h +1261 h +4 h +1780 h +4 h +1 h +1 h +11415 m +1 h +11416 m +110 h +4 h +4 h +4 h +4 h +11417 m +4 h +10 h +69 h +1 h +59 h +4 h +1 h +11418 m +83 h +11419 m +642 m +59 h +4 h +25 h +1 h +4 h +83 h +1697 m +10 h +1 h +3680 h +4 h +11420 m +10 h +1 h +36 h +3089 m +79 h +1 h +167 h +10 h +10 h +4 h +10 h +11421 m +124 h +11 h +57 h +109 h +11422 m +11423 m +10 h +10 h +1 h +10 h +11424 m +1 h +124 h +4 h +1053 m +4 h +4 h +1 h +4 h +4 h +488 h +1 h +113 h +57 h +195 h +4 h +1 h +4 h +10 h +1 h +73 h +10 h +1 h +358 h +11425 m +1 h +11426 m +1 h +11427 m +1 h +2887 h +1 h +11428 m +4 h +75 h +4 h +563 m +10 h +1 h +4 h +278 h +65 h +278 h +319 h +22 h +1 h +11429 m +11430 m +1 h +1939 m +10 h +1 h +1 h +1 h +10 h +4 h +4 h +3 h +10 h +11431 m +8212 m +125 h +10 h +1 h +2788 h +4 h +1 h +11432 m +167 h +4 h +4 h +124 h +1 h +626 h +11 h +125 h +4 h +123 h +25 h +125 h +11433 m +4 h +10 h +2508 m +8889 m +1 h +82 h +11434 m +4 h +11435 m +911 h +1 h +4 h +368 h +10 h +65 h +11436 m +4 h +164 h +25 h +4177 m +1 h +11437 m +258 h +4 h +11438 m +4 h +4 h +129 h +4 h +1 h +3396 m +65 h +167 h +3484 m +195 h +1 h +4 h +1 h +10 h +4 h +258 h +3 h +11439 m +11440 m +11441 m +1 h +22 h +9335 m +25 h +4 h +172 h +1038 m +45 h +73 h +170 h +1650 h +578 h +10 h +11442 m +4516 m +1 h +10 h +11443 m +377 m +1619 h +4 h +536 h +10 h +4 h +1646 m +1 h +1 h +10 h +1016 h +4 h +5526 m +82 h +11444 m +94 h +31 h +4 h +185 h +4 h +1184 m +124 h +31 h +11445 m +124 h +4 h +1 h +73 h +11446 m +169 h +4 h +4 h +4 h +11447 m +6129 m +4 h +10 h +4 h +2126 m 
+158 h +11448 m +1 h +11449 m +22 h +4 h +11450 m +11451 m +1642 h +11 h +10 h +4 h +11452 m +1185 m +11453 m +11454 m +11455 m +23 h +118 h +25 h +1595 m +1 h +91 h +1 h +73 h +4 h +4 h +4 h +4 h +82 h +25 h +1 h +10 h +74 h +104 h +1 h +11456 m +83 h +11457 m +109 h +143 h +109 h +108 h +10 h +10 h +4 h +10 h +5 h +4 h +4 h +4 h +56 h +1 h +79 h +4 h +4 h +4 h +10 h +4 h +11458 m +36 h +5869 m +4 h +12 h +1 h +4 h +4 h +359 h +11459 m +11 h +1 h +11460 m +13 h +11461 m +10 h +1 h +11462 m +4 h +4 h +4 h +105 m +11 h +1 h +4 h +164 h +1 h +11463 m +11464 m +11465 m +1564 m +11466 m +48 h +11467 m +1017 m +10 h +11468 m +109 h +433 m +83 h +4 h +124 h +10 h +10 h +11469 m +1 h +1 h +1 h +10 h +262 h +4 h +97 h +1 h +3 h +11 h +119 h +10 h +11470 m +11471 m +10 h +3768 m +10 h +11472 m +4 h +262 h +10 h +10 h +1 h +82 h +1 h +4 h +1 h +10 h +4 h +4 h +4 h +172 h +11473 m +1 h +4 h +11474 m +11475 m +266 h +7395 m +10 h +10 h +10 h +59 h +1 h +1 h +57 h +1261 h +83 h +4 h +1 h +4 h +123 h +3 h +4 h +11476 m +10 h +4 h +1 h +3184 m +11477 m +1 h +4 h +1 h +1 h +1 h +11478 m +224 h +118 h +4 h +91 h +1 h +1 h +109 h +64 h +82 h +146 h +11 h +57 h +4 h +10 h +4 h +11 h +1 h +10 h +1 h +10 h +1 h +4 h +1977 m +4 h +135 h +10 h +11479 m +10 h +10 h +10 h +4 h +10 h +4 h +4 h +11480 m +124 h +1642 h +1 h +1 h +1 h +214 m +4 h +10 h +4127 m +399 h +1 h +10 h +7938 m +11 h +109 h +4 h +11481 m +82 h +1 h +1556 m +4 h +1 h +1754 m +900 m +11482 m +11483 m +4 h +11484 m +25 h +181 h +124 h +1 h +57 h +2423 m +4 h +11485 m +83 h +4 h +1 h +4 h +41 h +1030 h +10 h +4 h +164 h +10 h +4 h +11486 m +1 h +11487 m +10512 m +10 h +11 h +124 h +4 h +36 h +10 h +27 h +11488 m +4 h +1 h +4 h +10 h +4 h +1 h +11489 m +97 h +10 h +146 h +28 h +1 h +146 h +10 h +124 h +4 h +10 h +143 h +57 h +11490 m +6187 m +74 h +181 h +74 h +1 h +4 h +10 h +1 h +83 h +97 h +2128 m +10 h +1403 h +8610 m +1261 h +190 h +164 h +11491 m +4 h +97 h +4 h +31 h +57 h +10 h +4 h +11492 m +1 h +4 h +170 h +433 m 
+1 h +11493 m +11494 m +11495 m +158 h +4 h +11496 m +4 h +4 h +1 h +10 h +4 h +1 h +10 h +1 h +11497 m +11498 m +10 h +11499 m +11500 m +1017 m +289 h +3161 m +10 h +56 h +11501 m +11502 m +1 h +4 h +11 h +82 h +4 h +135 h +4 h +1 h +11503 m +190 h +733 m +65 h +601 h +125 h +97 h +11504 m +4 h +186 h +10 h +1 h +11505 m +241 m +83 h +412 m +10 h +125 h +11506 m +4 h +10 h +59 h +83 h +146 h +4 h +3 h +6461 m +190 h +4 h +59 h +11507 m +190 h +8 h +11 h +9156 m +1 h +11508 m +4 h +11509 m +97 h +10 h +4 h +109 h +1 h +41 h +143 h +11510 m +1 h +11511 m +4 h +987 m +10 h +4 h +65 h +1 h +11512 m +124 h +1 h +11513 m +4 h +10 h +307 h +10 h +4 h +808 h +10 h +11514 m +11515 m +4 h +4 h +135 h +124 h +1 h +10 h +185 h +4 h +332 h +1 h +1 h +1 h +358 h +857 m +1 h +10 h +45 h +147 h +11 h +10 h +4 h +4 h +10 h +11516 m +4 h +447 h +285 m +1 h +10 h +10 h +109 h +1260 h +4 h +4 h +104 h +31 h +11517 m +4 h +164 h +219 m +10 h +11518 m +4 h +109 h +10 h +11519 m +4 h +94 h +10 h +1 h +11 h +11520 m +124 h +196 h +22 h +11521 m +4 h +125 h +164 h +11522 m +11141 m +92 h +10 h +10 h +11523 m +1 h +1 h +1 h +10 h +1 h +13 h +4 h +4 h +10 h +1 h +4 h +11524 m +5017 m +6381 m +4 h +10 h +219 m +11525 m +4 h +1 h +4 h +79 h +195 h +10 h +11526 m +82 h +11527 m +164 h +1 h +82 h +11528 m +125 h +10 h +1 h +10 h +1 h +1 h +11529 m +1 h +10 h +4 h +4 h +4 h +4 h +4 h +11530 m +3 h +4 h +123 h +1 h +4 h +911 h +11531 m +3 h +36 h +10 h +11532 m +229 h +383 h +4 h +11533 m +1 h +4 h +371 h +59 h +10 h +1 h +45 h +10 h +4 h +2442 m +4 h +94 h +4 h +55 h +4 h +11534 m +119 h +10 h +4 h +4 h +1 h +4 h +10 h +4 h +9385 m +11535 m +459 m +11536 m +11537 m +1137 h +4 h +10 h +1 h +10 h +4 h +11538 m +1 h +83 h +11539 m +196 h +1 h +1 h +692 h +11540 m +83 h +11541 m +4 h +172 h +190 h +3558 m +82 h +11 h +1 h +4 h +1 h +1 h +11542 m +11543 m +4 h +10 h +114 h +11544 m +4 h +1 h +4 h +11545 m +11546 m +4 h +82 h +11547 m +4 h +10 h +11548 m +11 h +4 h +4 h +359 h +11 h +104 h +11549 m +1 
h +4 h +4 h +11 h +4 h +10 h +1 h +4 h +60 m +31 h +965 m +1 h +4 h +1016 h +11550 m +11551 m +11552 m +4 h +1281 m +113 h +4 h +10 h +12 h +11553 m +1 h +57 h +4 h +806 m +4 h +82 h +4 h +11554 m +2840 m +4 h +4 h +4 h +31 h +41 h +10 h +10 h +1 h +4 h +10 h +10 h +10 h +82 h +10 h +1185 m +11555 m +10 h +1 h +11556 m +11557 m +1 h +124 h +4 h +11558 m +65 h +158 h +73 h +11559 m +1 h +190 h +119 h +185 h +11 h +11 h +1 h +10 h +11560 m +1 h +1 h +31 h +4 h +10 h +4 h +74 h +11561 m +4 h +4 h +4256 m +11 h +1250 h +359 h +1 h +11562 m +307 h +11563 m +11564 m +4 h +493 m +4 h +10 h +10 h +94 h +4 h +4 h +2537 m +1 h +250 h +4 h +4 h +84 m +11565 m +1 h +1038 m +1 h +11566 m +41 h +219 h +4 h +196 h +11567 m +4 h +10 h +104 h +1 h +195 h +10 h +11568 m +4 h +11569 m +371 h +11570 m +79 h +1 h +55 h +10 h +196 h +11571 m +1403 h +10 h +10 h +4 h +10 h +1105 h +10 h +11572 m +11573 m +11574 m +11575 m +83 h +11576 m +22 h +4 h +4 h +1 h +11485 m +10 h +11577 m +11578 m +11579 m +125 h +4 h +4 h +109 h +1 h +79 h +10 h +4 h +1 h +1 h +11580 m +195 h +11581 m +332 h +1 h +656 m +4 h +4 h +25 h +11582 m +164 h +10 h +56 h +104 h +156 h +4 h +1 h +93 h +4 h +4 h +3025 m +1 h +11583 m +11584 m +1 h +124 h +10 h +1 h +11585 m +11586 m +4 h +167 h +11587 m +10 h +129 h +1 h +1 h +59 h +11588 m +4 h +36 h +1478 h +4 h +138 h +11589 m +11 h +150 m +4 h +4 h +11590 m +5093 m +1 h +229 h +4 h +1 h +11591 m +11592 m +3 h +1 h +10 h +1 h +74 h +4 h +1030 h +11593 m +1 h +4 h +1 h +4735 m +10 h +1016 h +1 h +1016 h +82 h +757 h +1 h +4 h +5475 m +11594 m +11595 m +4 h +1 h +11596 m +4 h +11597 m +57 h +11598 m +11599 m +74 h +10 h +109 h +11600 m +4 h +1 h +10 h +57 h +10 h +11601 m +11602 m +3 h +11603 m +1 h +10 h +1 h +1261 h +10 h +368 h +272 m +25 h +11604 m +31 h +10 h +59 h +97 h +11605 m +11606 m +1 h +11607 m +4 h +4 h +1027 h +11 h +1 h +1 h +1 h +57 h +493 m +1 h +1 h +10 h +110 h +4 h +4 h +11608 m +27 h +4 h +10 h +4 h +11 h +4 h +119 h +22 h +10 h +45 h +1017 h 
+11609 m +4 h +278 h +196 h +74 h +4 h +447 h +857 m +1 h +36 h +1444 m +172 h +11610 m +41 h +4 h +195 h +125 h +557 m +11611 m +4 h +238 m +4 h +82 h +4 h +48 h +55 h +4 h +4 h +11 h +8496 m +57 h +11612 m +10 h +1 h +1 h +1 h +1 h +181 h +10 h +4 h +371 h +11613 m +1678 m +11614 m +11615 m +1 h +10 h +1 h +11616 m +533 h +10 h +11617 m +41 h +11618 m +4 h +1 h +1 h +1 h +11619 m +10 h +83 h +8 h +59 h +1 h +1 h +4 h +4 h +135 h +135 h +4 h +11620 m +1 h +195 h +1 h +506 m +59 h +1 h +238 h +278 h +4 h +57 h +4 h +4 h +11621 m +4 h +272 m +59 h +10 h +700 m +57 h +59 h +4 h +4 h +59 h +4 h +11622 m +1278 m +1 h +1 h +10 h +11623 m +10 h +1 h +5650 m +11624 m +10 h +10 h +104 h +1 h +1 h +1 h +41 h +282 m +11625 m +1 h +4 h +4 h +8332 h +31 h +316 m +4 h +238 h +119 h +82 h +11626 m +238 h +4 h +4 h +11627 m +367 h +1 h +59 h +258 h +10 h +10 h +10 h +119 h +266 h +4 h +11628 m +1 h +258 h +2300 m +4 h +4 h +4 h +4 h +10 h +3 h +11 h +25 h +10 h +4 h +31 h +4 h +158 h +4 h +4 h +11629 m +31 h +64 h +139 h +4 h +57 h +4 h +11630 m +10 h +12 h +31 h +1 h +4 h +258 h +1 h +1 h +4 h +10 h +2251 m +3 h +4 h +10 h +97 h +10 h +15 m +11631 m +82 h +25 h +11632 m +2755 m +1 h +10 h +11633 m +4 h +41 h +110 h +4 h +4 h +1835 m +4 h +10 h +1 h +11 h +11634 m +4 h +1 h +186 h +11635 m +124 h +11636 m +1 h +11637 m +1 h +371 h +4 h +3 h +4 h +41 h +4 h +11638 m +4 h +1016 h +4 h +11639 m +79 h +1 h +11640 m +5387 m +4 h +25 h +412 m +11641 m +1 h +104 h +11642 m +11 h +104 h +1 h +156 h +4 h +295 h +11643 m +11644 m +1 h +11645 m +31 h +278 h +10914 m +82 h +4 h +11646 m +125 h +4297 m +11647 m +83 h +1 h +11648 m +4 h +13 h +11649 m +3473 m +11650 m +4 h +11651 m +1 h +238 h +10 h +4 h +167 h +297 h +1 h +4 h +4 h +146 h +4 h +463 h +1 h +11652 m +11653 m +4 h +11654 m +737 m +10 h +4 h +1893 m +1 h +4 h +1 h +1070 m +10 h +11 h +1822 h +109 h +10 h +297 h +230 m +4 h +1 h +143 h +386 h +569 h +4 h +11655 m +4 h +123 h +3 h +65 h +1 h +4 h +10 h +11656 m +10 h +59 h +1 h 
+124 h +12 h +500 m +1 h +4 h +4 h +138 h +4 h +10 h +11657 m +1 h +1 h +1 h +601 h +4 h +31 h +11 h +1 h +74 h +4 h +8496 m +4 h +1 h +11658 m +4 h +11659 m +64 h +1 h +10 h +97 h +1 h +2925 m +1 h +11660 m +1583 m +25 h +4 h +92 h +4 h +10 h +4 h +11661 m +1 h +10 h +11662 m +4 h +11663 m +1 h +1 h +1 h +1 h +1271 m +2719 m +11664 m +1 h +48 h +11665 m +11666 m +4 h +1074 m +4 h +11667 m +10 h +4 h +11668 m +10 h +371 h +4 h +31 h +640 m +4 h +1 h +10 h +158 h +1 h +4 h +11669 m +1 h +82 h +3 h +4 h +109 h +1 h +10 h +104 h +11670 m +1 h +124 h +4 h +4 h +11671 m +41 h +11672 m +4 h +45 h +4 h +156 h +1 h +11 h +11673 m +181 h +4 h +11674 m +4 h +4 h +11675 m +1 h +1 h +11676 m +371 h +4 h +238 h +1 h +1 h +4 h +1953 m +10 h +4 h +45 h +11 h +1 h +4 h +185 h +10 h +11677 m +11678 m +4 h +139 h +1265 m +4 h +28 h +4 h +10 h +11679 m +4 h +956 m +11680 m +4 h +6784 m +976 h +1 h +6001 m +11681 m +4 h +11682 m +1 h +4 h +1 h +4 h +3 h +4 h +10 h +108 h +11683 m +82 h +31 h +10 h +4 h +1 h +10 h +11 h +59 h +11684 m +4 h +4 h +1764 m +10 h +4 h +109 h +11685 m +11686 m +11687 m +11688 m +41 h +1 h +1 h +1 h +11689 m +4333 m +10 h +266 h +57 h +108 h +124 h +1 h +11690 m +4 h +4 h +808 h +11691 m +11692 m +307 h +1 h +10 h +757 h +172 h +10 h +11693 m +11694 m +4 h +104 h +1 h +119 h +57 h +1 h +196 h +10970 m +1 h +11 h +11695 m +11696 m +92 h +10 h +538 m +10 h +1 h +10 h +83 h +10 h +167 h +4 h +808 h +10 h +4 h +4 h +11697 m +10 h +69 h +48 h +4 h +4 h +11698 m +332 h +278 h +57 h +57 h +2258 m +4 h +41 h +125 h +1 h +4 h +1 h +4 h +11699 m +4 h +172 h +114 h +10 h +11 h +1 h +10 h +1 h +11700 m +1 h +1 h +147 h +10 h +4 h +4 h +195 h +4437 m +10 h +10 h +4 h +1 h +10 h +1 h +1 h +4 h +4 h +45 h +1 h +10 h +10 h +124 h +10 h +10 h +124 h +1 h +1 h +10 h +4 h +5230 m +1 h +10 h +59 h +11701 m +447 h +3 h +939 m +2418 m +4 h +1499 m +1642 h +167 h +10 h +12 h +10 h +79 h +4 h +97 h +59 h +4 h +11702 m +1 h +264 m +371 h +10 h +10 h +4 h +1 h +4 h +10 h +1 h +97 h +4 
h +11427 m +10 h +74 h +125 h +4 h +4 h +4 h +124 h +10 h +65 h +1 h +1030 h +4 h +36 h +31 h +4 h +4 h +2028 m +1 h +1470 h +185 h +4542 m +10 h +10 h +11 h +109 h +965 m +11 h +164 h +11 h +10 h +10 h +282 m +109 h +4 h +4 h +83 h +11703 m +4 h +11704 m +229 h +11705 m +5864 m +1 h +10 h +4 h +65 h +11706 m +4 h +41 h +4 h +4297 m +1 h +4 h +1 h +1 h +11707 m +109 h +4 h +10 h +11708 m +11709 m +11710 m +10 h +278 h +109 h +10 h +4 h +11711 m +4 h +13 h +4 h +73 h +3 h +4 h +57 h +11712 m +11713 m +11714 m +11715 m +11716 m +629 m +4 h +104 h +1 h +1 h +1 h +31 h +1 h +109 h +1 h +4 h +10 h +109 h +3 h +11717 m +4 h +4 h +640 m +4 h +1 h +10 h +109 h +1 h +6776 m +11718 m +1 h +4 h +146 h +11 h +11719 m +13 h +1 h +779 h +4 h +10 h +4 h +4929 m +10 h +4 h +4 h +57 h +1 h +1 h +11720 m +4 h +1 h +11721 m +1 h +307 h +11722 m +56 h +11723 m +11724 m +221 m +4 h +11725 m +1 h +4 h +4 h +1 h +4 h +10 h +169 h +123 h +104 h +146 h +10 h +185 h +11726 m +9282 m +11727 m +1 h +110 h +4 h +2300 m +4 h +11728 m +1 h +11729 m +11730 m +1 h +11731 m +135 h +10 h +1 h +1 h +4 h +2594 m +4 h +4 h +1 h +11732 m +196 h +82 h +11733 m +11734 m +10 h +11735 m +1 h +65 h +1 h +4 h +4 h +22 h +4 h +10 h +10 h +11736 m +1024 m +11737 m +4 h +1 h +11738 m +1 h +11739 m +1 h +4 h +10 h +4 h +1 h +1 h +1 h +195 h +4 h +11740 m +10 h +11741 m +45 h +10 h +1 h +10 h +74 h +27 h +4 h +1 h +4 h +11742 m +4188 m +11743 m +4 h +368 h +4 h +10 h +146 h +4 h +45 h +1189 m +11744 m +1 h +1 h +4 h +1 h +1 h +4 h +1 h +11745 m +11 h +11746 m +31 h +1 h +338 h +11747 m +11748 m +1 h +4 h +1 h +10 h +1027 h +83 h +64 h +4 h +41 h +1 h +1 h +11749 m +1 h +4 h +1 h +4 h +4 h +4 h +1 h +55 h +10 h +4 h +41 h +10 h +36 h +358 h +1 h +4 h +11750 m +10918 m +278 h +6311 m +172 h +1128 m +4 h +307 h +279 h +10 h +4 h +1 h +1886 m +278 h +4 h +195 h +11751 m +82 h +3772 m +11752 m +1 h +8188 m +10 h +1 h +1 h +59 h +358 h +186 h +999 m +83 h +3 h +4 h +11753 m +167 h +4 h +11754 m +1 h +10 h +11755 m +359 
h +11756 m +124 h +10 h +4 h +10 h +10 h +6747 m +4 h +10 h +113 h +10 h +25 h +4 h +8423 m +11757 m +82 h +4 h +11758 m +190 h +1 h +10 h +10 h +4 h +10070 m +1 h +2438 m +31 h +1 h +11759 m +1 h +4 h +4 h +204 h +4 h +11760 m +124 h +4 h +4 h +11761 m +10 h +4 h +11762 m +10 h +10 h +4 h +11763 m +10 h +459 m +1677 m +11 h +262 h +3 h +11764 m +4 h +135 h +4 h +3112 m +4 h +11765 m +195 h +386 h +4 h +124 h +1 h +170 h +10 h +1 h +10 h +108 h +11766 m +10 h +1 h +125 h +11767 m +4 h +203 h +190 h +11768 m +40 h +4 h +4 h +11769 m +4 h +11770 m +10 h +82 h +3 h +73 h +11771 m +12 h +386 h +4 h +1 h +10 h +10 h +536 h +4 h +918 m +73 h +11772 m +10 h +692 h +4 h +4 h +4 h +10 h +1250 h +935 h +939 m +11773 m +11774 m +4 h +4 h +123 h +1 h +4 h +11 h +4728 m +13 h +10 h +104 h +4 h +10 h +4 h +11 h +10 h +4 h +4 h +358 h +447 h +10 h +11775 m +4 h +238 h +802 m +4 h +4 h +1 h +119 h +1027 h +1 h +11776 m +11777 m +2265 h +10 h +10 h +4 h +185 h +4 h +74 h +10 h +265 h +4 h +11778 m +13 h +801 m +74 h +13 h +371 h +266 h +11779 m +11780 m +14 m +11781 m +11782 m +124 h +4 h +82 h +11783 m +459 m +1 h +4 h +4 h +3 h +687 m +59 h +11784 m +464 h +11785 m +10 h +1 h +10 h +1249 m +4 h +4 h +1 h +10 h +1 h +1 h +4 h +1 h +23 h +4 h +10 h +2891 m +1 h +4 h +10070 m +4 h +1 h +1 h +583 h +4 h +10 h +11786 m +8243 m +1 h +73 h +4 h +4 h +4 h +4 h +238 h +4 h +358 h +10 h +4 h +10 h +11787 m +274 h +4 h +4 h +4 h +4 h +57 h +4520 m +10 h +97 h +338 h +82 h +4 h +3558 m +4 h +65 h +6066 m +119 h +425 m +1619 h +4 h +1 h +4 h +4 h +57 h +10 h +4 h +278 h +195 h +4 h +11 h +4 h +11788 m +3 h +139 h +11789 m +4 h +1454 m +11790 m +195 h +104 h +11791 m +31 h +4 h +181 h +10 h +8 h +1 h +25 h +11792 m +11793 m +4 h +82 h +4 h +3216 m +11794 m +11795 m +10 h +4 h +11796 m +11797 m +41 h +1 h +2266 m +10 h +10 h +83 h +8 h +1 h +4 h +1 h +4 h +129 h +10 h +11798 m +11799 m +4 h +11800 m +196 h +4 h +10 h +718 m +4 h +4 h +935 h +4 h +82 h +1 h +11801 m +4 h +11802 m +4 h +4 h +10 h 
+4 h +10 h +1261 h +297 h +11803 m +10 h +4 h +4 h +57 h +36 h +4 h +4 h +4 h +11804 m +330 m +144 m +10 h +10 h +4 h +1 h +11805 m +31 h +289 h +1 h +2920 m +181 h +4 h +1406 h +4 h +92 h +4 h +135 h +897 m +10 h +1 h +4 h +478 h +11806 m +11807 m +195 h +1 h +109 h +104 h +11808 m +4 h +11809 m +1 h +103 h +11810 m +601 h +8497 m +169 h +1 h +10 h +10 h +57 h +4 h +10 h +4 h +4 h +4 h +10 h +4 h +41 h +4 h +11 h +11811 m +1 h +10 h +4 h +4 h +11 h +1 h +4 h +83 h +1 h +4 h +4 h +4 h +1 h +11812 m +1 h +11813 m +41 h +4 h +59 h +114 h +41 h +1 h +1 h +1 h +1 h +11 h +103 h +11814 m +73 h +3 h +4 h +147 h +620 m +10 h +599 m +1 h +11815 m +11816 m +278 h +11817 m +1 h +1018 m +13 h +4 h +332 h +10 h +41 h +4 h +10 h +10 h +118 h +258 h +74 h +1 h +4 h +800 m +1 h +4 h +11818 m +353 m +1 h +10 h +1 h +1 h +31 h +79 h +4 h +97 h +10 h +185 h +3 h +4 h +4 h +147 h +11819 m +11820 m +1 h +11821 m +41 h +4 h +386 h +11822 m +1 h +1 h +11 h +8 h +10 h +265 h +10 h +40 h +4 h +4 h +1 h +11823 m +10 h +3 h +4 h +10 h +1 h +4 h +4 h +3 h +114 h +4 h +8 h +109 h +10 h +4 h +4 h +1478 h +11824 m +10 h +11825 m +1 h +11826 m +10 h +11827 m +8974 m +41 h +10 h +4 h +31 h +169 h +1 h +3188 m +1 h +5562 m +4 h +1137 h +1 h +1 h +146 h +173 h +11828 m +4 h +10 h +11829 m +4 h +1 h +10 h +41 h +97 h +11 h +25 h +138 h +986 h +11830 m +11831 m +358 h +11832 m +31 h +4 h +256 m +4 h +11833 m +478 h +55 h +57 h +10 h +1 h +6613 m +692 h +11834 m +397 m +11835 m +1 h +10 h +435 m +1 h +10 h +10 h +4 h +4 h +1 h +110 h +25 h +1 h +146 h +1030 h +11836 m +11837 m +4 h +124 h +4 h +13 h +297 h +11838 m +11839 m +3 h +258 h +256 m +4 h +10 h +59 h +1 h +11840 m +1952 m +1 h +10 h +4 h +11841 m +1 h +4 h +11842 m +4 h +4 h +11843 m +41 h +10 h +10 h +11 h +1 h +4 h +10 h +10 h +28 h +3 h +4966 m +36 h +11844 m +4 h +1 h +1 h +10 h +11845 m +4 h +169 h +195 h +10 h +11846 m +10 h +1 h +11847 m +190 h +11848 m +11849 m +4 h +1 h +4 h +4 h +1 h +4 h +1 h +11850 m +1 h +4 h +4 h +11851 m +11852 
m +4 h +1650 h +11853 m +4 h +4 h +4 h +999 m +192 h +11854 m +687 h +146 h +11855 m +94 h +11 h +4 h +11856 m +10 h +11857 m +11858 m +536 h +11859 m +11860 m +10 h +11861 m +4 h +11862 m +733 m +1650 h +33 h +4 h +22 h +1 h +10 h +1 h +55 h +4 h +1 h +190 h +157 h +36 h +1 h +82 h +1445 m +10 h +11 h +307 h +11863 m +184 h +4 h +1 h +464 h +1 h +10 h +4 h +4 h +11864 m +3344 m +124 h +11865 m +11866 m +4 h +11867 m +1 h +48 h +10 h +57 h +172 h +4 h +94 h +11868 m +4 h +1 h +266 h +4 h +4 h +11 h +1 h +31 h +1 h +11869 m +4 h +4 h +4 h +1 h +10 h +1 h +4 h +10 h +108 h +10 h +1 h +1 h +11870 m +4 h +11871 m +4 h +124 h +1 h +1 h +11872 m +569 h +11873 m +1 h +4 h +10 h +11874 m +10 h +10 h +1 h +4 h +196 h +55 h +57 h +1 h +10 h +4 h +1 h +1403 h +11875 m +119 h +170 h +11876 m +1 h +4 h +74 h +10 h +11877 m +146 h +4 h +4 h +4 h +57 h +82 h +10 h +172 h +4 h +124 h +1 h +332 h +2585 m +10 h +11878 m +146 h +4 h +4 h +112 h +109 h +1 h +307 h +4 h +11879 m +4 h +4 h +11880 m +83 h +1 h +11881 m +4 h +10 h +74 h +1 h +4 h +11882 m +10 h +1 h +10 h +1 h +4 h +1 h +11883 m +11884 m +4 h +11885 m +59 h +11886 m +41 h +164 h +4 h +4 h +10 h +1 h +1 h +1 h +129 h +1 h +129 h +1817 m +11887 m +10 h +59 h +1 h +104 h +10 h +1 h +5600 m +57 h +14 m +10 h +11888 m +4 h +11889 m +11890 m +4 h +93 h +144 m +4 h +109 h +4 h +124 h +4 h +124 h +174 m +687 h +4 h +412 h +109 h +4 h +4 h +1 h +11891 m +581 m +40 h +82 h +147 h +11892 m +1 h +129 h +4 h +31 h +8 h +114 h +11893 m +4 h +1 h +536 h +11894 m +4 h +2002 m +2865 m +11 h +270 h +11895 m +10 h +1 h +31 h +11896 m +10 h +11897 m +4 h +59 h +139 h +11898 m +195 h +9482 m +1 h +4 h +4 h +10 h +185 h +4 h +11 h +11899 m +10 h +716 m +1 h +31 h +4 h +10 h +279 h +4 h +4 h +57 h +4 h +10 h +4 h +4 h +59 h +1 h +11900 m +4 h +224 h +11901 m +4 h +10 h +82 h +65 h +10 h +4 h +4 h +11902 m +10 h +83 h +10 h +69 h +10 h +10 h +4 h +11903 m +57 h +4 h +91 h +11904 m +1 h +1 h +1 h +1 h +4 h +10 h +4 h +1 h +1 h +295 h +11905 m +1 
h +36 h +4 h +1 h +1 h +104 h +10 h +11906 m +1 h +1 h +1771 m +158 h +11907 m +4 h +11908 m +10 h +4 h +6784 m +196 h +4 h +11909 m +123 h +10 h +4 h +10 h +172 h +196 h +31 h +10 h +1 h +4 h +65 h +11910 m +4 h +11911 m +73 h +11912 m +4 h +4 h +11 h +185 h +10 h +4 h +124 h +114 h +601 h +4 h +10 h +11913 m +147 h +330 m +70 m +173 h +1 h +570 m +1410 m +110 h +10 h +4 h +4 h +1 h +4 h +784 m +4 h +164 h +4 h +185 h +11914 m +169 h +266 h +258 h +119 h +1 h +4 h +10 h +10 h +359 h +124 h +4 h +10 h +11915 m +10 h +1 h +10 h +1 h +15 m +10 h +1 h +27 h +1 h +1 h +1 h +11 h +11916 m +4 h +4849 m +114 h +11917 m +11918 m +11919 m +10 h +538 h +124 h +11920 m +4 h +1 h +110 h +146 h +4 h +1 h +1 h +170 h +3 h +1 h +1 h +11921 m +1 h +10 h +1 h +4 h +113 h +4 h +1 h +1 h +114 h +11922 m +45 h +276 h +10 h +69 h +1470 h +241 m +2623 m +11 h +11923 m +10 h +10 h +11924 m +1 h +11925 m +83 h +1 h +1 h +4 h +10 h +59 h +4 h +4 h +10 h +73 h +11 h +10 h +307 h +1030 h +11926 m +11927 m +25 h +11928 m +11929 m +11930 m +4 h +11931 m +1 h +4 h +11932 m +4 h +10 h +1 h +1 h +4 h +10 h +11933 m +1 h +966 m +10 h +1 h +10 h +10 h +11934 m +4 h +61 m +83 h +93 h +11935 m +31 h +109 h +83 h +11936 m +57 h +4 h +11937 m +11938 m +10 h +104 h +11939 m +11940 m +11941 m +4 h +403 h +11942 m +4 h +1 h +307 h +4 h +10 h +10 h +4 h +3539 m +5505 m +4 h +104 h +4 h +10 h +10 h +31 h +11943 m +10 h +1281 m +11 h +4 h +11944 m +4 h +10 h +109 h +11945 m +4 h +25 h +11 h +1 h +1 h +4 h +11 h +11946 m +11947 m +109 h +10 h +1284 m +10 h +4 h +139 h +10 h +4 h +1 h +11948 m +11949 m +170 h +4 h +11950 m +10 h +4 h +11951 m +11 h +11952 m +143 h +3177 m +97 h +25 h +109 h +11953 m +11954 m +4 h +11955 m +11956 m +4 h +11957 m +4 h +1027 h +1 h +1 h +135 h +4 h +4 h +266 h +4 h +45 h +11958 m +938 h +10 h +196 h +10 h +4 h +11959 m +11960 m +4 h +4 h +11961 m +114 h +11962 m +1710 m +11 h +147 h +1 h +1 h +10 h +274 h +4 h +1 h +11963 m +1 h +1830 m +4 h +11 h +4 h +59 h +10 h +11 h +11964 m 
+1 h +4 h +1 h +4 h +4 h +1 h +4 h +11965 m +11966 m +11967 m +8 h +11968 m +939 h +1 h +82 h +11 h +1 h +1 h +4 h +4 h +190 h +4 h +147 h +10 h +11969 m +10 h +164 h +692 h +11970 m +1470 h +109 h +104 h +59 h +447 h +82 h +10 h +7585 m +11971 m +10 h +4 h +73 h +1 h +4 h +11972 m +4 h +1 h +11973 m +6784 m +4 h +82 h +4 h +31 h +11 h +41 h +124 h +4 h +31 h +11974 m +10 h +319 h +403 h +124 h +11975 m +11976 m +1 h +1 h +1 h +1632 m +11977 m +11978 m +4 h +82 h +1 h +11 h +74 h +4 h +4 h +4 h +11979 m +11980 m +4 h +1 h +264 m +11 h +11981 m +4 h +1 h +575 m +4 h +11982 m +4 h +4 h +4 h +4 h +236 m +4 h +4 h +57 h +10 h +4 h +10 h +10 h +11983 m +4 h +11984 m +1542 m +4 h +1957 m +11 h +11985 m +10 h +10 h +1 h +135 h +506 m +10 h +1 h +11 h +4 h +4 h +307 h +11986 m +82 h +10 h +11987 m +10 h +4 h +4 h +1 h +1 h +1 h +1 h +11988 m +11989 m +10 h +10 h +10 h +10 h +10 h +11990 m +1 h +1 h +57 h +57 h +11991 m +4 h +11 h +1 h +4 h +11992 m +82 h +83 h +4 h +278 h +4 h +11993 m +1 h +11 h +83 h +1 h +11994 m +10 h +11995 m +11996 m +4 h +1 h +170 h +82 h +11997 m +1 h +11998 m +11999 m +12000 m +1 h +10 h +1 h +10 h +124 h +195 h +2096 m +4 h +125 h +12001 m +185 h +74 h +6015 m +65 h +83 h +266 h +444 m +4 h +123 h +12002 m +10 h +12003 m +12004 m +578 h +4 h +10 h +12005 m +12006 m +1 h +10 h +4 h +276 h +190 h +4 h +11 h +12007 m +4 h +12008 m +1 h +11 h +1697 m +4 h +110 h +12009 m +1 h +12010 m +12011 m +4 h +4 h +10 h +4 h +1 h +12012 m +25 h +1 h +4 h +4 h +4 h +358 h +196 h +57 h +4 h +4 h +1 h +459 h +4 h +1 h +190 h +4 h +10 h +1 h +12013 m +274 h +10 h +12014 m +74 h +83 h +4 h +1 h +10 h +4 h +10 h +12015 m +3 h +966 m +1 h +160 m +45 h +190 h +125 h +1 h +12016 m +12017 m +10 h +1 h +12018 m +377 h +11 h +79 h +41 h +1 h +10 h +10 h +12019 m +65 h +2813 m +12020 m +10 h +1 h +12021 m +10 h +12022 m +4 h +1886 m +1 h +65 h +4 h +4 h +1 h +4 h +1 h +12023 m +12024 m +12025 m +114 h +447 h +10 h +10 h +10 h +1 h +83 h +687 h +4 h +1 h +12026 m +434 m +1 h 
+10 h +0 m +12027 m +4 h +10 h +2592 m +4 h +156 h +4 h +4 h +1 h +1 h +12028 m +10 h +4 h +4 h +4 h +12029 m +57 h +4 h +4 h +10 h +12030 m +11 h +1 h +1 h +69 h +4 h +1 h +12 h +169 h +136 m +12031 m +1299 m +4 h +12032 m +1 h +10 h +12033 m +12034 m +1 h +12035 m +124 h +4 h +1 h +4 h +10 h +12036 m +97 h +1 h +12037 m +1 h +12038 m +109 h +4471 m +4 h +12039 m +138 h +5964 m +536 h +4 h +4 h +1 h +4 h +10 h +185 h +59 h +10 h +4 h +4 h +181 h +12040 m +12041 m +10 h +1 h +4 h +4 h +1 h +1 h +601 h +520 m +3 h +12042 m +4 h +1189 m +2733 m +10 h +125 h +41 h +36 h +1 h +1 h +4 h +12043 m +4 h +1 h +4 h +192 h +1 h +135 h +12044 m +12045 m +12046 m +10 h +12047 m +11 h +12048 m +4 h +174 m +1 h +4 h +4 h +11 h +10 h +12049 m +1016 h +358 h +25 h +4 h +1 h +10 h +170 h +10 h +10 h +1137 h +11 h +1 h +10 h +10 h +69 h +4 h +110 h +4 h +4 h +65 h +11 h +12050 m +10 h +1 h +4 h +4 h +11 h +4 h +4 h +1089 h +1 h +12051 m +12052 m +4 h +31 h +12053 m +12054 m +12055 m +12056 m +1 h +11 h +4 h +186 h +10 h +11 h +12057 m +170 h +1016 h +4 h +124 h +196 h +229 h +939 h +1 h +1220 m +1751 m +82 h +124 h +10 h +40 h +129 h +4 h +92 h +12058 m +1 h +1 h +0 m +125 h +10 h +1 h +12059 m +1 h +1 h +4 h +4 h +56 h +113 h +4 h +12060 m +1 h +4 h +477 m +45 h +11 h +4 h +4 h +97 h +332 h +12061 m +12062 m +12063 m +11 h +2540 m +4 h +1 h +12064 m +11 h +1 h +55 h +4 h +45 h +4 h +104 h +911 h +4229 m +36 h +4 h +4 h +4 h +10 h +45 h +1 h +4 h +1 h +4 h +13 h +11 h +447 h +10 h +10330 m +10 h +8 h +1 h +12065 m +55 h +4 h +1266 m +12066 m +110 h +12067 m +55 h +4 h +273 m +12068 m +73 h +1 h +1 h +12069 m +1359 m +10 h +1 h +12070 m +4 h +12071 m +6025 m +109 h +12072 m +1 h +4 h +4 h +4 h +12073 m +4 h +12074 m +74 h +10 h +10 h +264 m +12075 m +12076 m +12077 m +276 h +278 h +4 h +4 h +377 h +2786 m +1 h +12078 m +4 h +12079 m +83 h +285 m +83 h +4 h +4 h +383 h +143 h +10 h +3341 m +12080 m +1 h +4 h +146 h +12081 m +278 h +1 h +181 h +1 h +4 h +124 h +12082 m +1 h +1 h +31 h 
+4 h +12083 m +1 h +11 h +1 h +4 h +4 h +79 h +10 h +12084 m +1017 h +12085 m +1 h +265 h +4 h +12086 m +383 h +59 h +4 h +3321 m +1409 m +12087 m +1 h +13 h +109 h +1 h +4 h +82 h +4 h +4 h +1 h +1 h +1 h +1 h +12088 m +12089 m +82 h +1 h +59 h +4 h +12090 m +1 h +10 h +12091 m +2788 h +1650 h +156 h +629 m +104 h +4 h +109 h +45 h +4 h +6788 m +1 h +167 h +1 h +1 h +2379 m +2116 m +12092 m +12093 m +1 h +10 h +1 h +12094 m +1 h +4 h +794 m +12095 m +12096 m +4744 m +1 h +4 h +10 h +59 h +278 h +1 h +1 h +928 m +11 h +1 h +4 h +4 h +1 h +3 h +12097 m +10 h +954 m +4 h +250 h +10 h +83 h +4 h +190 h +4 h +12098 m +12099 m +1 h +57 h +4 h +10 h +11 h +2665 m +10 h +578 h +4 h +36 h +533 h +12100 m +82 h +1 h +10 h +1574 m +6413 m +1 h +1 h +12101 m +4 h +93 h +12102 m +59 h +4 h +69 h +1 h +4 h +1 h +1250 h +12103 m +12104 m +10 h +2309 m +1 h +4 h +4 h +97 h +12105 m +64 h +41 h +4 h +4 h +1 h +28 h +1 h +1 h +12106 m +59 h +1 h +1 h +430 m +578 h +1 h +4 h +1 h +4 h +10 h +4 h +110 h +4 h +12107 m +1 h +4 h +4 h +4 h +3 h +12108 m +12109 m +10 h +57 h +73 h +10 h +1 h +4 h +4 h +1 h +4 h +12110 m +10 h +4 h +1 h +258 h +4 h +12111 m +25 h +1 h +12112 m +4 h +12113 m +3 h +12114 m +12115 m +3845 m +10 h +10 h +10 h +4 h +307 h +186 h +12116 m +124 h +12117 m +4 h +12118 m +1 h +12119 m +45 h +4106 m +12120 m +10 h +4 h +12121 m +169 h +64 h +124 h +4 h +1 h +4 h +167 h +12122 m +4 h +11 h +1 h +4 h +1 h +12123 m +4 h +10 h +1 h +4 h +12124 m +1 h +10 h +4 h +4 h +10 h +11 h +4 h +27 h +12125 m +4 h +1 h +79 h +10 h +4 h +1 h +8 h +270 h +1 h +41 h +12126 m +172 h +1 h +41 h +12127 m +12128 m +4 h +12129 m +59 h +2733 h +195 h +1359 m +1959 m +12130 m +12131 m +4 h +12132 m +4 h +12133 m +12134 m +12 h +297 h +12135 m +12136 m +1 h +1722 m +10 h +129 h +114 h +4 h +97 h +12137 m +383 h +229 h +10 h +10 h +1 h +6766 m +1 h +1 h +10 h +4 h +4 h +1 h +1 h +4127 m +4 h +12138 m +114 h +11 h +10 h +10 h +173 h +31 h +1 h +11 h +1406 h +10 h +12139 m +1 h +1 h +25 h +10 h 
+25 h +173 h +279 h +45 h +4 h +4 h +6095 m +83 h +224 h +4 h +114 h +1 h +939 h +4 h +4 h +11 h +57 h +4 h +10 h +10 h +10 h +4 h +687 h +146 h +10 h +12140 m +1 h +12141 m +4 h +25 h +1 h +1 h +1 h +170 h +10 h +4 h +31 h +4 h +83 h +12142 m +12143 m +307 h +12144 m +1 h +12145 m +4 h +1 h +4 h +83 h +10 h +4 h +74 h +13 h +4 h +10 h +10 h +1 h +4 h +258 h +4 h +124 h +10 h +1 h +1 h +12146 m +4 h +1 h +12147 m +4 h +770 m +57 h +266 h +4 h +104 h +1 h +4 h +10 h +167 h +57 h +4 h +1 h +1 h +8206 m +1 h +11 h +4 h +4 h +59 h +1 h +10 h +1138 m +83 h +4 h +241 m +12148 m +1 h +1409 m +1016 h +4 h +4 h +4 h +4 h +1 h +4 h +4 h +1 h +12149 m +11 h +12150 m +112 h +4 h +31 h +12151 m +583 h +10 h +12152 m +10 h +4 h +12153 m +1 h +4 h +4 h +10 h +12154 m +4 h +1556 m +135 h +2923 m +22 h +4 h +65 h +4 h +270 h +2928 m +12155 m +1 h +12156 m +4 h +12157 m +158 h +59 h +278 h +31 h +57 h +7649 m +1 h +10 h +59 h +1666 m +109 h +25 h +5125 m +2951 m +1 h +10 h +4 h +64 h +582 m +4 h +4 h +1 h +12158 m +963 m +12159 m +1 h +12160 m +4 h +2794 m +1 h +2205 m +83 h +4 h +1 h +4 h +57 h +1 h +1 h +1 h +4 h +25 h +1454 m +6437 m +10 h +4 h +10 h +10 h +307 h +25 h +94 h +2041 m +1 h +10 h +12161 m +4 h +25 h +4 h +4 h +3668 m +4 h +4 h +1 h +196 h +195 h +65 h +11 h +31 h +12162 m +4 h +10 h +10 h +12163 m +12164 m +1 h +1486 m +2339 m +4 h +4 h +13 h +12165 m +4 h +12166 m +45 h +12167 m +12168 m +4 h +4 h +10 h +10 h +94 h +1 h +289 h +12169 m +10 h +1 h +1 h +124 h +358 h +12170 m +1 h +10 h +97 h +4 h +25 h +1016 h +4 h +4297 h +888 m +124 h +146 h +59 h +4 h +258 h +12171 m +10 h +4 h +4 h +10 h +4 h +480 m +57 h +400 m +4 h +4 h +1 h +4 h +10 h +10 h +444 m +4 h +4 h +520 m +12172 m +10 h +65 h +4 h +4 h +266 h +4 h +10 h +104 h +104 h +4 h +4 h +270 h +82 h +10 h +12173 m +12131 m +147 h +114 h +1 h +10 h +10 h +12174 m +4 h +12175 m +1 h +12176 m +4 h +196 h +143 h +1 h +10 h +11 h +11 h +64 h +1 h +1 h +4 h +4 h +4 h +4 h +4 h +12 h +74 h +1 h +104 h +1796 m +12177 
m +1 h +10 h +1 h +113 h +57 h +4 h +1 h +4 h +4 h +12178 m +1 h +1 h +82 h +4 h +4 h +386 h +12179 m +12180 m +4 h +4 h +4 h +4 h +1 h +1105 h +104 h +12181 m +1 h +10 h +4 h +92 h +4 h +10 h +1 h +1 h +104 h +12182 m +12183 m +4 h +10 h +12184 m +203 h +104 h +10 h +12185 m +4 h +4 h +871 m +12186 m +4 h +65 h +10 h +1 h +147 h +266 h +170 h +11 h +12187 m +124 h +12188 m +10 h +1 h +82 h +4 h +1 h +11 h +12189 m +124 h +1 h +4 h +4 h +25 h +13 h +4 h +4 h +195 h +1 h +64 h +10 h +12190 m +464 h +1 h +31 h +1 h +4 h +1 h +1 h +169 h +4 h +1 h +4 h +447 h +82 h +10 h +73 h +282 h +129 h +12191 m +79 h +4 h +10 h +25 h +1 h +196 h +4 h +12192 m +4 h +8274 m +10 h +10 h +10 h +4 h +12193 m +295 h +1 h +1 h +1 h +12194 m +12195 m +12196 m +10 h +3402 m +125 h +143 h +12197 m +4 h +12 h +10 h +1 h +4 h +12198 m +74 h +4849 m +10 h +4 h +12199 m +12200 m +31 h +4 h +10 h +12201 m +1 h +139 h +4 h +11 h +12202 m +4 h +1 h +12203 m +4 h +4 h +83 h +1261 h +119 h +83 h +4 h +4 h +1 h +4 h +463 h +4 h +118 h +4 h +41 h +4 h +12 h +12204 m +1 h +65 h +1 h +4 h +3 h +1 h +1 h +10 h +3702 m +12205 m +12206 m +4 h +57 h +22 h +4 h +12207 m +12208 m +1 h +1 h +1 h +4 h +4 h +4 h +12209 m +3 h +10 h +1 h +12210 m +10 h +4 h +12211 m +55 h +12212 m +1 h +170 h +156 h +1 h +4 h +4 h +12213 m +1 h +12214 m +4 h +10 h +1 h +10 h +4 h +12215 m +4 h +4 h +11 h +221 m +4 h +196 h +1137 h +4 h +327 m +10 h +12216 m +12217 m +11 h +10 h +4 h +10 h +4 h +10 h +10 h +10 h +56 h +12218 m +1 h +4 h +184 h +1 h +986 h +12219 m +12220 m +12221 m +4 h +73 h +73 h +124 h +10 h +10 h +57 h +170 h +12222 m +55 h +12223 m +4 h +196 h +57 h +12224 m +12225 m +10 h +4 h +1 h +4 h +12226 m +4 h +109 h +276 h +146 h +74 h +575 m +104 h +1 h +12227 m +11 h +4 h +1 h +12228 m +12229 m +10 h +388 m +12230 m +12231 m +82 h +4 h +4 h +185 h +1 h +185 h +1828 m +1 h +4 h +12232 m +10 h +10 h +4 h +1 h +278 h +4 h +12233 m +4 h +12234 m +12235 m +1 h +10 h +4 h +12236 m +112 h +1 h +12237 m +169 h +4 h +1 h 
+1835 m +74 h +1 h +1 h +10 h +4 h +10 h +4 h +82 h +10 h +10 h +8608 m +12238 m +4 h +12239 m +10659 m +4 h +4 h +12240 m +10 h +3278 m +12241 m +2002 m +3 h +82 h +55 h +4 h +1 h +4 h +25 h +124 h +57 h +172 h +10 h +12242 m +186 h +195 h +10 h +10 h +986 h +82 h +1403 h +4 h +10 h +31 h +1 h +57 h +1 h +12243 m +1 h +1822 h +12244 m +4 h +12245 m +10 h +10 h +83 h +12246 m +82 h +4 h +124 h +4 h +10 h +114 h +12 h +31 h +12247 m +109 h +692 h +4 h +8854 m +1137 h +238 h +12248 m +12249 m +4 h +41 h +31 h +31 h +12250 m +4 h +12251 m +10 h +332 h +13 h +57 h +1 h +109 h +4 h +36 h +12252 m +83 h +4 h +10 h +12253 m +4 h +447 h +12254 m +195 h +4 h +82 h +1 h +1 h +12255 m +10 h +4 h +10 h +57 h +4 h +10 h +4 h +4 h +1 h +10 h +12256 m +110 h +10 h +250 h +12257 m +82 h +4 h +10 h +10 h +258 h +73 h +25 h +59 h +65 h +1470 h +25 h +10 h +12258 m +1379 m +41 h +31 h +1 h +1 h +12259 m +31 h +4 h +12260 m +3 h +4 h +4 h +279 h +196 h +181 h +1 h +12261 m +4 h +266 h +12262 m +10 h +4 h +12263 m +146 h +4 h +25 h +278 h +12264 m +4 h +12265 m +12266 m +12267 m +4 h +12268 m +10 h +10 h +12269 m +12270 m +1 h +1 h +12271 m +104 h +4 h +2923 m +1 h +1 h +11 h +1 h +12272 m +8556 m +4 h +10 h +990 m +1 h +4 h +83 h +4 h +8643 m +536 h +4 h +143 h +12273 m +1 h +57 h +912 m +1 h +4 h +3 h +4 h +11 h +4 h +12274 m +4 h +74 h +386 h +10 h +538 h +91 h +1027 h +40 h +1074 h +1 h +10 h +1 h +4 h +12275 m +4 h +12276 m +4 h +1 h +1 h +4 h +173 h +124 h +124 h +4 h +10 h +4 h +109 h +74 h +1299 m +12277 m +4 h +146 h +1 h +4 h +12278 m +8104 m +4 h +139 h +1 h +1 h +7306 m +12279 m +297 h +135 h +10 h +12280 m +12281 m +1 h +4 h +1201 m +1 h +10 h +11 h +4815 m +10 h +10 h +196 h +10 h +1 h +83 h +4 h +12282 m +12283 m +10 h +11 h +4 h +147 h +1 h +1 h +4 h +2719 m +12284 m +157 h +1 h +11 h +4 h +4 h +4 h +1 h +79 h +1201 h +12285 m +12 h +12286 m +1 h +4 h +12287 m +4 h +1 h +12288 m +4 h +12289 m +10 h +4 h +4 h +12290 m +10 h +1 h +4 h +10745 m +1 h +1 h +1 h +4 h +1698 m 
+1 h +74 h +11 h +1 h +4 h +4 h +425 m +10 h +10 h +4 h +12291 m +4 h +1 h +12292 m +270 h +10 h +3 h +12293 m +10 h +4 h +12294 m +12295 m +1453 m +4 h +4 h +4 h +1953 m +12296 m +83 h +1 h +488 h +757 h +170 h +4 h +1 h +443 h +12297 m +12298 m +2374 m +1504 m +74 h +1 h +13 h +10 h +144 h +10 h +147 h +399 h +4 h +10 h +12299 m +1 h +4 h +1 h +6503 m +10 h +10 h +82 h +10 h +4 h +1089 h +124 h +538 h +12300 m +1 h +110 h +12301 m +12302 m +4 h +73 h +238 h +4 h +36 h +4 h +185 h +4 h +12303 m +11 h +12304 m +12305 m +730 m +1 h +56 h +10 h +12306 m +578 h +6129 m +12307 m +4 h +124 h +12308 m +12309 m +10 h +4 h +1 h +12310 m +12311 m +110 h +12312 m +13 h +1 h +4 h +1 h +297 h +1 h +1 h +12313 m +11 h +12314 m +4 h +4 h +1 h +12194 m +10 h +10 h +4 h +4701 m +4 h +94 h +12 h +10 h +4 h +4 h +10 h +83 h +13 h +41 h +10 h +10 h +250 h +12315 m +11 h +4 h +12316 m +1 h +135 h +12317 m +1 h +41 h +12318 m +1 h +1 h +4 h +238 h +109 h +10 h +4 h +12319 m +10 h +1 h +109 h +12320 m +12 h +48 h +12321 m +4 h +10 h +83 h +4 h +12322 m +14 h +57 h +124 h +22 h +146 h +12323 m +57 h +4 h +12324 m +1 h +83 h +6457 m +1 h +4 h +1 h +164 h +146 h +10 h +616 m +4 h +97 h +12325 m +4 h +4 h +10 h +4 h +1 h +1 h +12326 m +1 h +1337 m +195 h +1650 h +10 h +10 h +12327 m +13 h +140 m +1016 h +1858 m +4 h +1 h +12328 m +12329 m +12330 m +196 h +4 h +5621 m +12331 m +1 h +4 h +12332 m +12333 m +1 h +10 h +11 h +332 h +82 h +4 h +4 h +4 h +11 h +82 h +4 h +12334 m +2379 m +82 h +10 h +1953 m +4 h +4 h +10 h +1 h +1 h +119 h +10 h +12335 m +4 h +1 h +12336 m +12337 m +12 h +45 h +181 h +25 h +196 h +12338 m +4 h +12339 m +4 h +74 h +1 h +1089 h +443 h +1137 h +4 h +91 h +12340 m +10 h +1 h +12341 m +12342 m +104 h +1 h +157 h +12343 m +113 h +41 h +12344 m +4 h +4 h +192 h +12345 m +1 h +737 m +1 h +59 h +12346 m +12347 m +1 h +170 h +23 h +12348 m +4 h +371 h +31 h +57 h +10 h +6187 m +12349 m +1 h +4 h +184 h +27 h +274 h +4 h +12350 m +274 h +1766 m +57 h +4 h +10 h +3 h +10 h 
+11 h +1 h +59 h +4 h +7 m +125 h +12351 m +12352 m +4 h +12353 m +12354 m +4 h +10 h +4 h +1 h +12355 m +4 h +4 h +12356 m +196 h +4 h +3 h +56 h +4 h +10 h +56 h +10 h +256 h +1 h +4 h +869 m +4 h +1 h +12 h +124 h +4 h +6784 h +4 h +4 h +230 m +403 h +1 h +266 h +4 h +12357 m +4 h +1 h +57 h +12358 m +4 h +164 h +77 h +1 h +12359 m +92 h +4 h +12360 m +1 h +55 h +2205 m +4 h +3 h +10 h +59 h +83 h +4 h +1 h +57 h +82 h +74 h +174 m +73 h +10 h +27 h +1 h +10 h +4 h +65 h +4 h +4 h +1 h +146 h +12361 m +12362 m +12363 m +10 h +12364 m +359 h +4 h +12365 m +12366 m +538 h +4 h +1 h +12367 m +10 h +2285 m +1 h +4 h +65 h +1 h +11 h +1 h +4 h +1 h +10 h +4 h +106 m +10 h +104 h +1 h +59 h +185 h +125 h +1406 h +4 h +10 h +164 h +1 h +12368 m +41 h +10 h +3 h +114 h +1 h +520 h +1 h +10 h +10 h +386 h +59 h +4 h +556 h +1 h +10 h +12369 m +12370 m +4 h +12371 m +4 h +4 h +976 h +10 h +11 h +1 h +4 h +4 h +23 h +124 h +1 h +4 h +4 h +59 h +196 h +12372 m +4 h +4 h +10 h +10 h +11 h +1 h +1 h +10 h +4240 m +1 h +1 h +4 h +4 h +1 h +4 h +56 h +93 h +12373 m +4 h +74 h +4 h +4 h +125 h +12374 m +1 h +536 h +4 h +31 h +12375 m +4 h +10 h +3 h +2923 h +443 h +12376 m +1 h +258 h +12377 m +10 h +570 m +12378 m +4 h +976 h +640 h +195 h +1089 h +12 h +2172 m +87 m +1 h +737 m +575 m +1137 h +64 h +488 h +12379 m +319 h +104 h +12380 m +4 h +4 h +4 h +10 h +238 h +12381 m +1 h +56 h +1 h +4 h +104 h +3 h +4 h +4 h +1 h +195 h +4 h +4 h +144 h +506 m +4 h +196 h +11 h +2002 m +1 h +27 h +73 h +4 h +4111 m +195 h +266 h +1 h +10 h +4 h +3 h +316 m +10 h +1 h +12382 m +83 h +282 h +12383 m +1 h +91 h +4 h +10 h +4 h +1 h +443 h +143 h +83 h +27 h +1016 h +185 h +12384 m +12385 m +4 h +12386 m +10 h +11 h +12387 m +12388 m +3 h +65 h +1 h +10 h +10 h +12389 m +12390 m +12391 m +12392 m +307 h +1 h +59 h +12393 m +1 h +1 h +4 h +1 h +10 h +12394 m +169 h +12395 m +12396 m +0 h +124 h +82 h +10 h +4 h +270 h +124 h +1 h +10 h +1 h +1 h +146 h +4 h +11 h +1 h +10 h +77 h +93 h +4 h 
+1 h +12397 m +57 h +12398 m +4 h +4 h +1 h +4 h +601 h +10 h +11 h +1 h +10 h +4 h +10 h +4 h +79 h +10 h +1 h +12399 m +358 h +4 h +10 h +274 h +1 h +4 h +10 h +83 h +1 h +4 h +4 h +10 h +135 h +332 h +12400 m +1 h +1 h +41 h +4229 m +4 h +10 h +4 h +4 h +358 h +1 h +1 h +1 h +12401 m +4 h +4 h +190 h +371 h +109 h +1 h +1 h +219 h +10 h +12402 m +12403 m +1714 m +196 h +12404 m +1 h +1 h +1 h +10 h +1 h +10 h +1822 h +12405 m +1 h +1 h +1955 m +109 h +11 h +1 h +10 h +12406 m +12407 m +258 h +82 h +4966 m +274 h +10 h +10 h +270 h +12408 m +1 h +4 h +48 h +12409 m +27 h +12410 m +4 h +1737 m +12411 m +4 h +4 h +4 h +12412 m +4 h +10 h +4 h +12413 m +73 h +4 h +4 h +1 h +10 h +12414 m +1 h +45 h +123 h +4 h +4 h +12415 m +12416 m +4 h +12417 m +4 h +94 h +169 h +1957 m +4 h +83 h +12418 m +4 h +97 h +4 h +1 h +12419 m +3 h +83 h +4 h +97 h +1 h +4 h +11 h +1 h +1 h +10 h +1 h +1478 h +12420 m +12421 m +4 h +12422 m +173 h +12423 m +1 h +4 h +10 h +1 h +4 h +229 h +4 h +4 h +110 h +12424 m +1 h +4 h +4 h +338 h +190 h +10 h +31 h +10 h +10 h +10 h +1 h +874 m +12425 m +10 h +10 h +4 h +125 h +10 h +104 h +79 h +12426 m +4 h +4 h +238 h +12427 m +1 h +10 h +3 h +11 h +2442 m +12428 m +10 h +59 h +12429 m +10 h +10 h +377 h +12430 m +1 h +1 h +12431 m +1 h +4 h +533 h +4 h +25 h +82 h +10 h +1 h +4 h +10 h +4 h +10 h +4 h +4 h +11 h +4520 m +10 h +4 h +12432 m +64 h +4 h +1 h +10 h +27 h +4 h +12433 m +12434 m +276 h +12 h +4 h +124 h +82 h +1 h +110 h +45 h +12435 m +12436 m +4 h +4 h +11 h +10 h +4 h +823 m +12437 m +3 h +31 h +1 h +125 h +4 h +82 h +12438 m +1 h +4 h +4 h +22 h +4 h +12 h +4 h +1 h +230 m +12439 m +4 h +332 h +885 m +146 h +1 h +12440 m +4 h +4 h +12441 m +11 h +125 h +4 h +4 h +12442 m +82 h +12443 m +4 h +1 h +1 h +10 h +92 h +359 h +1 h +4 h +12444 m +583 h +59 h +2617 m +12445 m +4 h +1 h +41 h +1 h +112 h +12446 m +1437 m +601 h +4 h +83 h +59 h +4 h +6753 m +4 h +12447 m +241 m +10 h +125 h +1 h +157 h +31 h +92 h +358 h +12448 m +82 h 
+12449 m +1 h +82 h +36 h +27 h +1499 m +12450 m +4 h +12451 m +124 h +125 h +4 h +74 h +12452 m +12453 m +11 h +1261 h +986 h +4 h +1 h +1838 m +4 h +4 h +4 h +4 h +12454 m +1 h +11 h +12455 m +581 m +4 h +12456 m +4 h +25 h +83 h +4 h +12457 m +4 h +4 h +10 h +104 h +10 h +10 h +4 h +27 h +4 h +10 h +41 h +1053 m +1 h +146 h +1 h +10 h +4 h +12458 m +4 h +1642 h +109 h +196 h +1 h +12459 m +1 h +12460 m +10 h +31 h +3299 m +1 h +1 h +1 h +1 h +4 h +1 h +4 h +4 h +4 h +4 h +12461 m +4 h +1 h +10 h +359 h +97 h +4 h +1499 m +55 h +1359 h +12462 m +11 h +10 h +4 h +229 h +1 h +4 h +4 h +10 h +1 h +10 h +27 h +12463 m +1 h +4 h +4 h +9 m +1 h +358 h +12464 m +104 h +167 h +31 h +1 h +11 h +11 h +12465 m +12 h +6469 m +1 h +1 h +1 h +12466 m +59 h +146 h +4 h +1 h +10 h +57 h +10 h +1 h +1308 m +57 h +104 h +25 h +7 m +4 h +10 h +258 h +112 h +1 h +976 h +31 h +4 h +1 h +386 h +1 h +4 h +1 h +1 h +12467 m +332 h +12 h +10 h +4 h +74 h +4 h +1 h +140 h +4 h +57 h +2418 m +124 h +1205 m +4 h +10 h +10 h +1 h +10 h +1 h +3 h +124 h +12468 m +10 h +4 h +1 h +10 h +4 h +1 h +4 h +1 h +12469 m +1 h +601 h +4 h +10 h +4 h +12470 m +1 h +12471 m +1 h +3 h +2465 m +1 h +1 h +124 h +1 h +4 h +12272 m +4 h +12472 m +10 h +12473 m +307 h +64 h +272 m +4 h +12474 m +1535 m +114 h +10 h +4 h +109 h +295 h +10 h +4 h +12475 m +1 h +4 h +4 h +1 h +33 h +123 h +12476 m +31 h +10 h +1 h +1 h +12477 m +181 h +12478 m +97 h +56 h +4 h +12479 m +4 h +1 h +1 h +1 h +1 h +1 h +181 h +12480 m +8 h +4 h +10 h +12481 m +4 h +770 m +170 h +1 h +1 h +4 h +83 h +1 h +1 h +1 h +12482 m +12483 m +4 h +1 h +12484 m +4 h +4 h +10 h +12485 m +1 h +12486 m +12487 m +1299 m +4 h +2927 m +12488 m +322 m +77 h +10 h +10 h +135 h +12489 m +4 h +4 h +1822 h +12490 m +4 h +1 h +1 h +109 h +104 h +4 h +536 h +11490 m +12491 m +12492 m +4 h +1089 h +1 h +1 h +4 h +12493 m +4 h +1 h +12494 m +4 h +10 h +4 h +12495 m +10 h +12496 m +10 h +1 h +31 h +12497 m +10 h +2625 m +79 h +4 h +12498 m +10 h +1 h +125 h 
+186 h +12 h +1 h +79 h +4 h +1 h +10 h +12499 m +4 h +169 h +935 h +1 h +7572 m +12500 m +718 m +4 h +10 h +4 h +12501 m +12502 m +1 h +12503 m +11 h +3 h +10243 m +1 h +1667 m +4 h +8146 m +12504 m +10 h +4 h +4 h +10 h +4 h +3 h +36 h +82 h +10 h +4 h +12505 m +4 h +1 h +447 h +4 h +1 h +4 h +146 h +146 h +4 h +10 h +640 h +4 h +1 h +1261 h +110 h +1 h +10 h +4 h +4 h +10 h +4 h +12506 m +4 h +13 h +4 h +1 h +25 h +1955 m +10 h +1 h +12507 m +12508 m +4 h +1 h +4 h +1 h +3 h +12509 m +4 h +25 h +1074 h +114 h +1 h +1 h +11 h +4 h +10 h +93 h +4 h +4 h +12510 m +10 h +1 h +1 h +4 h +1024 m +601 h +158 h +1 h +12511 m +82 h +10 h +73 h +12512 m +12513 m +4 h +12514 m +1 h +4 h +12515 m +4 h +1 h +12516 m +10 h +124 h +4 h +1574 m +12 h +1 h +45 h +146 h +6187 m +1 h +83 h +1 h +167 h +12517 m +59 h +4 h +4 h +1 h +10 h +250 h +10 h +10 h +12518 m +4 h +4 h +12519 m +1 h +31 h +3177 m +4 h +1 h +11 h +109 h +1 h +10 h +12520 m +1619 h +12521 m +478 h +1 h +12522 m +7809 m +10 h +10 h +368 h +1 h +1 h +12523 m +1 h +12524 m +82 h +4 h +55 h +1499 h +119 h +4 h +2379 m +1939 m +10 h +12525 m +1504 m +4 h +355 m +538 h +1 h +1 h +1 h +74 h +124 h +12526 m +82 h +77 h +10 h +11 h +1189 m +5348 m +265 h +322 m +1 h +1 h +74 h +1 h +4 h +10 h +806 m +10 h +185 h +10 h +4 h +4 h +1 h +1 h +4815 m +1 h +195 h +1 h +10 h +170 h +986 h +12527 m +4 h +4 h +12528 m +12529 m +447 h +5224 m +124 h +12530 m +91 h +10 h +4 h +1 h +4 h +146 h +4 h +1 h +10 h +1 h +1366 m +1 h +12531 m +1 h +10099 m +12532 m +10 h +12533 m +367 h +114 h +12534 m +12535 m +3704 m +1646 m +1 h +1 h +41 h +447 h +41 h +219 h +109 h +82 h +1 h +219 h +4 h +4 h +55 h +1 h +13 h +3 h +10 h +12536 m +1 h +12537 m +1 h +55 h +3 h +10 h +10 h +1 h +12538 m +1 h +94 h +11 h +4 h +59 h +368 h +4 h +83 h +4 h +12539 m +10 h +10 h +4 h +1 h +4 h +1 h +12540 m +1 h +10 h +1 h +1 h +12541 m +12542 m +307 h +1642 h +4 h +4 h +12543 m +64 h +10 h +4 h +65 h +4 h +4 h +10 h +82 h +4 h +1 h +12544 m +12545 m +12546 m 
+12547 m +11 h +1389 m +1 h +12548 m +12549 m +10 h +4 h +10 h +4 h +10 h +274 h +12550 m +1 h +1 h +12551 m +4 h +10 h +10 h +124 h +4 h +12552 m +10 h +12553 m +1 h +57 h +1 h +4 h +79 h +82 h +147 h +10 h +12554 m +4 h +15 m +4 h +167 h +1 h +12555 m +79 h +10 h +569 h +4 h +219 h +12556 m +1714 m +3 h +146 h +12557 m +12558 m +12559 m +4 h +601 h +1766 h +1 h +1337 m +12560 m +1 h +4 h +12561 m +11 h +12562 m +82 h +12563 m +12564 m +4 h +55 h +12565 m +195 h +4 h +12566 m +4 h +1 h +13 h +12567 m +538 h +4 h +1 h +1 h +938 h +12568 m +12569 m +258 h +10 h +12570 m +74 h +4 h +10 h +12571 m +12572 m +4 h +307 h +1 h +4 h +1 h +443 h +11 h +1 h +11 h +4 h +4 h +10 h +4 h +10 h +11 h +13 h +12573 m +1 h +4 h +12574 m +12575 m +4 h +12576 m +1 h +10 h +12577 m +4 h +718 m +4 h +144 h +56 h +12578 m +12579 m +12580 m +4 h +12581 m +4 h +12582 m +4 h +94 h +1650 h +4 h +367 h +55 h +4 h +45 h +4 h +12583 m +12584 m +12585 m +1 h +10 h +104 h +1948 m +10 h +1359 h +28 h +12586 m +4 h +10 h +4 h +1919 m +4 h +12587 m +4 h +1309 m +238 h +46 m +1 h +12588 m +12589 m +10 h +8 h +578 h +112 h +83 h +1 h +1056 m +4 h +10 h +10 h +1751 m +1 h +124 h +10 h +11 h +1 h +1 h +12590 m +82 h +4 h +172 h +57 h +12591 m +4 h +1 h +12592 m +4 h +10 h +10 h +12593 m +4 h +12594 m +4 h +4 h +3499 m +4 h +10 h +1 h +4 h +195 h +4 h +45 h +1 h +12595 m +4 h +10 h +12596 m +146 h +4 h +1508 m +1 h +203 h +238 h +2733 h +12597 m +4 h +7901 m +158 h +10 h +9347 m +169 h +888 m +1 h +4 h +1 h +12598 m +1 h +41 h +4 h +12599 m +1 h +12600 m +4 h +10 h +4 h +12 h +274 h +11 h +10 h +4 h +1835 m +10 h +4 h +1 h +4 h +124 h +170 h +118 h +10 h +11 h +1 h +1 h +4 h +12601 m +12602 m +170 h +4 h +295 h +1486 m +92 h +10 h +4 h +11 h +124 h +10 h +4 h +1 h +109 h +4 h +3161 m +12603 m +4 h +10 h +4 h +10 h +12604 m +83 h +59 h +4 h +31 h +747 m +10 h +10 h +11 h +3 h +10 h +12605 m +57 h +4 h +185 h +1 h +12606 m +4 h +12607 m +1 h +12 h +4810 m +82 h +169 h +3 h +10 h +12608 m +10 h +4 h +1020 m 
+1030 h +12609 m +279 h +10 h +1619 h +4 h +181 h +4 h +12610 m +4 h +10 h +4 h +27 h +4 h +12611 m +12612 m +1 h +11 h +25 h +4 h +11 h +10 h +4 h +156 h +4 h +10 h +25 h +4 h +83 h +1981 m +31 h +976 h +4 h +83 h +10 h +25 h +147 h +12613 m +4 h +299 m +1 h +1 h +4 h +1 h +10 h +4 h +266 h +718 h +12614 m +1 h +1 h +258 h +1 h +358 h +6851 m +12615 m +939 h +147 h +4 h +1 h +12616 m +4 h +4 h +10 h +1 h +1 h +4 h +12617 m +12618 m +12619 m +4 h +4 h +12620 m +4 h +59 h +4 h +12621 m +1 h +4 h +4 h +94 h +307 h +108 h +12622 m +172 h +125 h +12623 m +1886 m +1 h +83 h +1 h +1 h +3342 m +109 h +10 h +4 h +1 h +11 h +1 h +158 h +12624 m +12625 m +1 h +184 h +4 h +1089 h +56 h +4 h +59 h +205 m +1 h +1 h +4 h +10 h +12626 m +295 h +11 h +4 h +4 h +687 h +1 h +10 h +4 h +10 h +4 h +164 h +109 h +279 h +10 h +4 h +4 h +4 h +4 h +1 h +4 h +157 h +82 h +626 m +11 h +196 h +109 h +278 h +8147 m +10 h +4 h +59 h +5 m +1 h +104 h +83 h +3 h +83 h +1 h +82 h +104 h +361 m +4 h +10 h +57 h +12627 m +4 h +10 h +4 h +1 h +4 h +12 h +11 h +4 h +10 h +1 h +9431 m +124 h +4 h +1 h +82 h +4 h +10 h +82 h +109 h +10 h +55 h +4 h +1 h +1 h +4 h +1 h +10 h +196 h +4 h +11 h +238 h +124 h +4 h +1532 m +368 h +1 h +73 h +332 h +1 h +12628 m +124 h +4 h +12629 m +9902 m +113 h +1127 m +10 h +4 h +41 h +1322 m +12 h +108 h +10 h +1235 m +435 m +169 h +92 h +4 h +10 h +12630 m +1 h +12631 m +447 h +10 h +250 h +147 h +196 h +4 h +4 h +4 h +10 h +258 h +1 h +1 h +1 h +12632 m +4 h +4 h +57 h +12633 m +9040 m +10 h +4 h +41 h +1 h +258 h +82 h +12634 m +4 h +27 h +146 h +4 h +41 h +12635 m +146 h +12636 m +4 h +10 h +1 h +125 h +10 h +12637 m +4 h +10 h +82 h +83 h +1 h +12638 m +1 h +12639 m +2072 m +1 h +12640 m +4 h +10 h +36 h +4 h +1 h +3396 m +4 h +10 h +4 h +1 h +83 h +12641 m +74 h +4 h +10 h +1 h +1 h +12642 m +31 h +10 h +4 h +400 m +4 h +1 h +4 h +1 h +1 h +4 h +1 h +12643 m +12644 m +25 h +82 h +238 h +12645 m +173 h +97 h +538 h +12 h +74 h +1576 m +10 h +4 h +10 h +4 h +12646 m 
+4 h +59 h +4 h +4 h +4 h +12647 m +83 h +4 h +12648 m +4 h +4 h +1 h +4 h +10 h +1 h +818 m +12649 m +10 h +4 h +129 h +869 m +3 h +1 h +4 h +1 h +4 h +12650 m +11 h +4 h +12651 m +1 h +1 h +11 h +4 h +10 h +12652 m +1 h +1 h +4 h +4 h +12653 m +4 h +1 h +4 h +443 h +238 h +1 h +1 h +4 h +278 h +3 h +77 h +4 h +1 h +1 h +4 h +10 h +12654 m +4 h +4 h +687 h +10 h +1 h +118 h +10 h +4 h +109 h +368 h +10 h +10 h +12655 m +430 m +104 h +1 h +1 h +4 h +4 h +12656 m +1 h +139 h +12657 m +12658 m +1 h +10 h +45 h +114 h +10 h +55 h +258 h +1 h +57 h +12659 m +4 h +82 h +1 h +1 h +119 h +368 h +4 h +113 h +12660 m +1 h +4 h +13 h +82 h +4524 m +1 h +12661 m +4 h +4 h +12662 m +4 h +4 h +4 h +12663 m +1 h +1 h +7553 m +12664 m +12665 m +12666 m +109 h +12667 m +281 m +12668 m +1772 m +4 h +1 h +10 h +12669 m +156 h +4 h +1261 h +94 h +196 h +124 h +1655 m +156 h +4 h +4 h +966 h +10 h +1 h +12670 m +25 h +10 h +1 h +12671 m +83 h +4 h +802 m +587 m +1 h +10 h +94 h +147 h +1 h +135 h +3 h +4 h +10 h +258 h +12672 m +4 h +11 h +4 h +57 h +12 h +12673 m +4 h +4 h +12674 m +109 h +12675 m +12676 m +55 h +146 h +1 h +12677 m +10 h +1659 m +3373 m +4 h +1 h +4 h +12678 m +48 h +12679 m +4 h +4 h +1 h +143 h +125 h +169 h +4 h +1271 m +4 h +146 h +4 h +173 h +12680 m +10 h +4 h +10 h +4 h +1 h +1 h +10 h +123 h +135 h +4 h +1 h +156 h +181 h +4 h +1 h +1 h +4 h +12681 m +1 h +4 h +124 h +10 h +10 h +40 h +12682 m +4 h +13 h +11 h +125 h +1 h +4 h +4 h +11 h +196 h +114 h +1 h +10 h +4 h +1250 h +1201 h +4 h +12683 m +10 h +12684 m +12685 m +4 h +4 h +4 h +119 h +13 h +10 h +25 h +82 h +4 h +4 h +4 h +10 h +195 h +1 h +10 h +307 h +74 h +12686 m +1 h +1 h +10 h +4 h +12687 m +1 h +4 h +12688 m +4 h +1 h +4 h +1 h +3188 m +83 h +1 h +125 h +10 h +10 h +1 h +10 h +4905 m +11 h +12689 m +167 h +1 h +10 h +146 h +11 h +92 h +12690 m +9228 m +1 h +4 h +4 h +104 h +4 h +65 h +1 h +10 h +12691 m +79 h +195 h +4 h +57 h +12692 m +4 h +12693 m +4 h +1486 m +13 h +12694 m +10 h +12695 m 
+10 h +104 h +4 h +4 h +1 h +12696 m +1 h +146 h +4 h +59 h +10 h +4 h +4 h +12697 m +10 h +59 h +1 h +12698 m +4 h +12699 m +82 h +123 h +104 h +10 h +4 h +124 h +784 m +278 h +10 h +767 m +129 h +4 h +4 h +1 h +1542 m +10 h +4 h +4378 m +12700 m +147 h +4 h +10 h +124 h +10 h +181 h +4 h +386 h +10 h +4 h +92 h +82 h +1 h +147 h +10 h +1 h +12701 m +1 h +4 h +79 h +12702 m +4 h +169 h +41 h +172 h +11 h +82 h +4 h +4 h +109 h +4 h +1 h +1 h +106 m +10 h +3 h +12703 m +11358 m +1 h +479 m +885 m +12704 m +266 h +1 h +192 h +10 h +1 h +1 h +12705 m +1261 h +4 h +10 h +399 h +12706 m +4 h +9893 m +1 h +10 h +538 h +1 h +79 h +4 h +4 h +12707 m +10 h +82 h +1 h +12708 m +12709 m +4 h +57 h +12710 m +12711 m +11 h +12712 m +2794 m +4 h +195 h +1 h +12713 m +1 h +1261 h +57 h +4 h +1 h +1 h +10 h +10 h +1 h +4 h +4 h +104 h +10 h +601 h +10 h +4 h +5422 m +1 h +486 m +83 h +10 h +45 h +4 h +4 h +4 h +10 h +12714 m +4 h +4 h +55 h +12715 m +1 h +4 h +48 h +4 h +1359 h +4 h +12716 m +12717 m +10 h +57 h +1 h +11 h +1201 h +1 h +82 h +353 m +12718 m +1089 h +1 h +4 h +31 h +48 h +4 h +1 h +1 h +1 h +143 h +12719 m +1 h +4 h +10 h +10 h +25 h +250 h +4 h +10 h +113 h +10 h +25 h +1 h +4 h +12720 m +110 h +59 h +1 h +10 h +1 h +4 h +4 h +10 h +1 h +1 h +12721 m +1 h +12722 m +4 h +92 h +1366 m +10 h +10 h +4 h +143 h +10 h +1 h +1 h +104 h +12199 m +1185 m +169 h +12723 m +4 h +4 h +601 h +124 h +12724 m +57 h +265 h +3 h +11 h +1 h +12725 m +10 h +12726 m +4 h +10 h +1 h +4 h +4 h +11 h +4 h +59 h +1 h +4 h +1 h +74 h +4 h +196 h +1804 m +1 h +146 h +12727 m +4 h +164 h +11 h +59 h +158 h +46 m +4 h +82 h +4 h +1 h +464 h +1 h +1 h +12728 m +12729 m +97 h +12730 m +1 h +3 h +3 h +11 h +83 h +1 h +59 h +12731 m +12732 m +4 h +4 h +146 h +31 h +4 h +83 h +41 h +10 h +12733 m +10 h +11 h +103 h +109 h +12734 m +4 h +1 h +338 h +4 h +10 h +3067 m +11 h +4 h +1 h +10 h +125 h +10 h +12735 m +4 h +124 h +4 h +135 h +1 h +12736 m +2846 m +12737 m +4 h +12738 m +229 h +12739 m +10 
h +11 h +1535 m +1 h +10 h +123 h +124 h +12740 m +1737 m +10 h +82 h +31 h +1 h +10 h +4 h +12741 m +1718 m +4 h +13 h +4 h +4 h +358 h +1 h +11477 m +12742 m +83 h +4 h +10 h +12743 m +12744 m +55 h +125 h +4 h +4 h +12745 m +12746 m +10 h +10 h +996 m +4 h +403 h +1 h +83 h +1 h +1 h +12747 m +1 h +1 h +147 h +4 h +4 h +4 h +2935 m +1 h +4 h +41 h +10 h +124 h +1 h +91 h +12748 m +4 h +10 h +1 h +12749 m +430 m +12750 m +10 h +4 h +6391 m +3 h +10 h +25 h +64 h +1 h +1 h +12751 m +12752 m +83 h +4 h +1 h +4 h +4 h +12753 m +82 h +10 h +4 h +4 h +964 m +12754 m +1 h +1 h +4 h +12755 m +4 h +12756 m +109 h +10177 m +83 h +10 h +1 h +10 h +4 h +10 h +3 h +10 h +1 h +10 h +12757 m +1278 m +12758 m +1 h +12759 m +403 h +83 h +92 h +1 h +4 h +510 m +195 h +124 h +10 h +4 h +10 h +12760 m +1 h +12761 m +12762 m +4 h +10 h +1 h +12 h +6808 m +10 h +4 h +195 h +4 h +4 h +258 h +1691 h +10 h +82 h +4 h +4 h +10 h +10 h +56 h +258 h +1 h +83 h +4 h +12763 m +12764 m +1 h +12765 m +1 h +4 h +10 h +1 h +1 h +57 h +4 h +1 h +4 h +12766 m +135 h +36 h +25 h +12767 m +12768 m +10 h +12769 m +12770 m +10 h +1 h +4 h +4 h +59 h +4 h +12771 m +12772 m +1 h +4 h +4 h +10 h +10 h +82 h +4 h +648 m +733 m +11 h +1304 m +92 h +4 h +181 h +4 h +109 h +12773 m +299 m +3 h +4 h +12774 m +4 h +1 h +1 h +1 h +1 h +196 h +1 h +4 h +939 h +12775 m +4 h +4 h +110 h +1 h +12776 m +109 h +4 h +4 h +4 h +59 h +4 h +97 h +1 h +1 h +1027 h +4 h +10 h +1 h +12777 m +12778 m +1 h +12779 m +25 h +10 h +10 h +135 h +12780 m +185 h +10 h +10 h +4 h +36 h +4 h +4 h +264 h +4 h +11 h +82 h +4 h +368 h +4 h +1 h +4 h +10 h +4 h +4 h +31 h +59 h +10 h +74 h +10 h +4 h +4 h +4 h +1 h +10 h +11 h +1 h +1 h +3 h +4 h +147 h +1 h +10 h +41 h +3070 m +4 h +4718 m +10 h +763 m +295 h +12781 m +1 h +12782 m +4 h +4 h +4 h +4 h +12783 m +4 h +1 h +124 h +1 h +109 h +1 h +41 h +4 h +4 h +12784 m +12785 m +82 h +109 h +10 h +25 h +110 h +10 h +2506 m +10 h +12 h +13 h +1 h +125 h +1 h +10 h +1 h +12786 m +12787 m 
+124 h +383 h +31 h +4 h +1 h +12788 m +12789 m +4 h +12790 m +4 h +4 h +74 h +1 h +4 h +4 h +1 h +11 h +10 h +443 h +4 h +12791 m +36 h +12792 m +1 h +82 h +12793 m +59 h +12794 m +12795 m +10221 m +12796 m +1027 h +10 h +12797 m +11 h +190 h +12798 m +4 h +10 h +4 h +692 h +31 h +82 h +94 h +59 h +7479 m +1265 m +1 h +124 h +41 h +4 h +1 h +12799 m +109 h +82 h +1535 m +12800 m +114 h +12801 m +4 h +4 h +4 h +124 h +12802 m +1454 m +4 h +69 h +10 h +4 h +10 h +383 h +36 h +4 h +57 h +4 h +1 h +109 h +57 h +4 h +1 h +10 h +4 h +4 h +3 h +1 h +4 h +12803 m +4 h +10 h +73 h +12804 m +4 h +10 h +10 h +1016 h +10 h +1 h +1359 h +4 h +12805 m +12806 m +11 h +359 h +3 h +54 m +1 h +4 h +10 h +4 h +1 h +10 h +964 m +12807 m +4 h +12808 m +4 h +12809 m +1271 m +12810 m +4 h +73 h +273 m +3 h +10 h +1 h +1 h +10 h +1499 h +12811 m +12812 m +4 h +74 h +143 h +12813 m +12814 m +1 h +114 h +12815 m +1016 h +674 m +12816 m +4 h +10 h +1 h +1 h +12817 m +939 h +12818 m +4 h +687 h +10 h +4 h +10 h +1 h +12819 m +1 h +1 h +12820 m +12821 m +4 h +55 h +12822 m +1 h +12823 m +1 h +4 h +4074 m +4 h +12824 m +447 h +82 h +12825 m +1 h +59 h +12826 m +1 h +1 h +10 h +11 h +10 h +1 h +97 h +31 h +4 h +12827 m +10 h +4 h +4 h +12828 m +258 h +10 h +146 h +135 h +12829 m +12830 m +124 h +74 h +10 h +10 h +1 h +1 h +10 h +4 h +12831 m +108 h +478 h +1 h +1 h +1 h +1 h +10 h +10 h +1 h +1 h +332 h +12832 m +12833 m +4723 m +22 h +4 h +4 h +4 h +1619 h +10 h +12834 m +885 m +4 h +1 h +4 h +1 h +83 h +1 h +4 h +4 h +4 h +12835 m +4 h +4 h +1470 h +1 h +1 h +1 h +4 h +10 h +1 h +4 h +10 h +10 h +4 h +4 h +31 h +10 h +1 h +12836 m +73 h +4 h +4 h +74 h +12837 m +92 h +4252 m +1 h +4 h +59 h +12838 m +1016 h +10 h +12839 m +10 h +4 h +12840 m +1 h +4 h +10 h +4 h +103 h +146 h +10 h +4 h +4 h +12841 m +1374 m +229 h +12842 m +12 h +114 h +4 h +12843 m +56 h +12844 m +1 h +885 h +4 h +12845 m +124 h +11 h +10 h +266 h +238 h +1 h +12846 m +181 h +119 h +10272 m +146 h +4 h +73 h +1 h +11 h +195 
h +65 h +12847 m +12848 m +41 h +4 h +10 h +12849 m +4 h +79 h +4 h +12850 m +10 h +1 h +41 h +313 m +4 h +1 h +10 h +64 h +1650 h +1 h +1 h +1 h +12851 m +12852 m +13 h +12853 m +4 h +2625 m +278 h +10 h +10 h +570 m +55 h +12854 m +12855 m +10 h +238 h +12856 m +4 h +10 h +353 m +266 h +1 h +12857 m +124 h +1 h +4 h +1 h +4 h +4 h +1 h +12858 m +4 h +12859 m +27 h +114 h +10 h +109 h +125 h +1 h +1 h +41 h +12860 m +12861 m +1 h +10 h +12862 m +1 h +8332 m +8663 m +11 h +338 h +12863 m +25 h +4 h +358 h +1 h +1 h +1 h +97 h +1 h +1 h +11 h +1 h +10 h +12864 m +4 h +147 h +12865 m +57 h +55 h +12866 m +1 h +10 h +297 h +4 h +11 h +1 h +12867 m +12868 m +1 h +1 h +4 h +4 h +10 h +4 h +12869 m +12870 m +10 h +12871 m +140 h +4 h +125 h +2813 m +4 h +10 h +4 h +4 h +1 h +4 h +124 h +124 h +12872 m +1 h +73 h +12873 m +4 h +12874 m +12875 m +12876 m +11 h +4 h +4 h +4 h +1 h +11 h +1 h +57 h +97 h +3 h +10 h +59 h +11 h +12877 m +270 h +12878 m +3 h +1 h +12879 m +2257 m +4 h +6290 m +12880 m +2438 m +1261 h +1 h +12881 m +4 h +1 h +12882 m +4 h +4 h +1 h +12883 m +238 h +250 h +316 m +4 h +1 h +1 h +1 h +59 h +1 h +57 h +12884 m +1 h +174 m +1 h +11 h +1 h +12885 m +990 m +4 h +12886 m +4 h +4 h +12887 m +794 m +3 h +10 h +4 h +399 h +45 h +10 h +12888 m +4 h +12889 m +12890 m +150 m +4 h +147 h +104 h +1 h +4 h +12891 m +1 h +10 h +10 h +109 h +4 h +12892 m +12893 m +12894 m +4 h +12895 m +538 h +258 h +5093 m +119 h +10783 m +1030 h +1 h +12896 m +3 h +806 m +4 h +12897 m +4 h +4 h +4 h +4 h +12898 m +82 h +1 h +27 h +4 h +1 h +10 h +28 h +10 h +7135 m +4 h +170 h +4 h +181 h +10 h +1 h +1 h +73 h +4 h +358 h +1 h +158 h +4 h +4 h +4 h +4 h +10 h +4 h +10 h +4 h +4 h +4 h +1 h +3 h +4 h +4 h +1 h +4 h +181 h +4 h +1 h +12899 m +59 h +12900 m +4 h +110 h +229 h +92 h +55 h +10 h +11 h +119 h +104 h +12901 m +10 h +10 h +4 h +12902 m +57 h +10 h +1 h +1 h +185 h +110 h +1 h +12903 m +1 h +12904 m +1 h +1261 h +10 h +1 h +167 h +190 h +10 h +12905 m +1309 h +4 h +124 
h +12906 m +4 h +1 h +27 h +4 h +12907 m +12908 m +763 m +4 h +4 h +25 h +12909 m +12910 m +10 h +1 h +12911 m +11 h +289 h +265 h +12912 m +4 h +1 h +4596 m +147 h +83 h +10 h +10 h +1 h +158 h +1 h +4 h +1 h +4 h +1 h +4 h +12913 m +10 h +12914 m +4 h +12915 m +4 h +4 h +1 h +12916 m +10 h +4 h +12917 m +12918 m +10 h +4 h +36 h +1 h +109 h +10 h +1 h +109 h +10 h +10 h +97 h +10 h +167 h +4 h +185 h +4 h +4 h +10 h +4 h +11 h +12919 m +10 h +1 h +12920 m +1 h +10 h +74 h +1 h +4 h +109 h +3 h +12921 m +59 h +4 h +4 h +114 h +307 h +1 h +4 h +1 h +10 h +11 h +4 h +4 h +4 h +91 h +4 h +12922 m +56 h +1 h +10 h +10 h +4 h +3345 m +12923 m +10 h +4 h +97 h +270 h +4 h +10 h +10 h +4 h +1 h +4 h +12924 m +10 h +4 h +1 h +12925 m +10 h +4 h +1 h +12926 m +147 h +112 h +1 h +1 h +4 h +1 h +4 h +124 h +4 h +64 h +4 h +10 h +4 h +640 h +181 h +359 h +4 h +1642 h +4 h +10 h +109 h +4 h +114 h +1 h +74 h +1409 h +4 h +4 h +57 h +1 h +109 h +10 h +77 h +4 h +12927 m +1 h +11 h +265 h +1 h +4 h +12928 m +10 h +1650 h +25 h +4 h +1 h +1 h +1 h +12929 m +146 h +4 h +12930 m +4 h +4 h +4 h +74 h +1 h +12931 m +83 h +11 h +4 h +12932 m +13 h +12933 m +4 h +4 h +1 h +12934 m +79 h +12935 m +196 h +1017 h +10 h +11 h +12936 m +12 h +172 h +1 h +4 h +41 h +4 h +4 h +4 h +4 h +4 h +12937 m +4 h +27 h +4 h +172 h +10 h +265 h +12938 m +4 h +82 h +4 h +59 h +1 h +10 h +10 h +297 h +82 h +4 h +258 h +1 h +2617 m +10 h +11 h +4 h +11 h +10 h +41 h +12939 m +219 h +4 h +4 h +97 h +12940 m +4 h +10 h +1822 h +12941 m +1 h +4 h +278 h +12942 m +12943 m +4 h +4 h +12944 m +12945 m +12946 m +1713 m +10 h +83 h +332 h +425 m +4 h +12947 m +41 h +125 h +104 h +4 h +3742 m +12948 m +10 h +1 h +12949 m +1 h +447 h +10 h +10 h +258 h +83 h +4 h +181 h +1 h +10 h +172 h +92 h +92 h +56 h +12950 m +186 h +4 h +12951 m +1 h +4 h +10 h +10 h +12952 m +56 h +447 h +10 h +12953 m +12954 m +1 h +274 h +4 h +25 h +4 h +12955 m +4 h +135 h +108 h +1 h +12956 m +10 h +1 h +4 h +2582 m +4 h +12957 m +12958 
m +4 h +10 h +1137 h +805 m +59 h +82 h +8133 m +4 h +4 h +10 h +195 h +114 h +1 h +10 h +12959 m +8017 m +4 h +12960 m +4 h +4 h +10 h +11 h +169 h +12961 m +1 h +5141 m +10 h +3988 m +12962 m +8327 m +11 h +12963 m +12964 m +10 h +4 h +1 h +12965 m +172 h +12966 m +238 h +1 h +77 h +10 h +104 h +1 h +10 h +4 h +1 h +692 h +10 h +4177 m +109 h +1 h +4 h +4 h +1 h +1 h +12967 m +12968 m +4 h +195 h +12969 m +1 h +1685 m +10 h +10 h +4 h +12970 m +3742 m +12971 m +1 h +1780 h +4 h +4 h +10 h +10 h +12972 m +10 h +4 h +12973 m +12974 m +4 h +1 h +12975 m +1 h +10 h +124 h +57 h +12976 m +10 h +6187 m +196 h +11 h +79 h +12977 m +12978 m +1003 m +1 h +124 h +4 h +12979 m +1 h +1 h +11 h +82 h +59 h +4 h +4 h +3 h +10 h +5348 m +266 h +4 h +12980 m +1 h +11 h +1 h +109 h +12981 m +94 h +4 h +10 h +1271 h +4 h +4 h +12982 m +10 h +104 h +10 h +12983 m +327 m +41 h +4 h +4 h +1454 m +1 h +1 h +1 h +10 h +25 h +12984 m +10 h +4 h +10 h +10 h +3115 m +4 h +11 h +4 h +10 h +4 h +81 m +31 h +4 h +4 h +258 h +59 h +12985 m +10 h +10 h +114 h +4 h +4 h +4 h +109 h +1 h +4 h +172 h +82 h +2374 m +11 h +64 h +27 h +1 h +10 h +4 h +12986 m +12987 m +12988 m +41 h +1504 m +4 h +12989 m +4 h +4 h +10 h +12990 m +1 h +4 h +266 h +12991 m +4 h +4 h +31 h +12992 m +4 h +12993 m +1 h +274 h +1389 m +10 h +4 h +4 h +3 h +1 h +170 h +11 h +386 h +1 h +10 h +10 h +10 h +10 h +11 h +203 h +12994 m +12995 m +4 h +1 h +12996 m +4 h +146 h +1 h +10 h +27 h +299 m +1454 m +4229 m +258 h +56 h +10 h +1250 h +4 h +94 h +1 h +10 h +4 h +4 h +1 h +10 h +1 h +12 h +146 h +229 h +82 h +5363 m +1 h +25 h +12997 m +4 h +10 h +3680 m +59 h +1 h +10 h +4 h +2592 m +12998 m +10 h +4 h +158 h +4 h +56 h +59 h +55 h +12999 m +10 h +57 h +13000 m +104 h +1 h +2865 m +10 h +4 h +1 h +1 h +11 h +1 h +4 h +135 h +4 h +1 h +1 h +13001 m +4 h +1 h +990 m +79 h +10 h +1003 m +45 h +1 h +10378 m +4 h +1 h +1 h +10 h +225 m +1 h +10 h +10 h +13002 m +41 h +92 h +1 h +2617 m +13003 m +10 h +97 h +1766 h +74 h +10 h 
+4 h +25 h +3344 m +266 h +1 h +4 h +13004 m +4 h +1 h +10 h +1 h +13005 m +297 h +4 h +143 h +4 h +1 h +10 h +4 h +13006 m +1 h +1 h +4 h +13007 m +1 h +10 h +13008 m +10 h +158 h +10 h +82 h +4 h +4 h +464 h +22 h +4 h +195 h +4 h +5863 m +61 m +4 h +4 h +1 h +10 h +10 h +1 h +702 m +11 h +4 h +13009 m +13010 m +204 m +13011 m +4 h +13012 m +164 h +278 h +4 h +41 h +109 h +13013 m +57 h +13014 m +1 h +13015 m +1 h +9771 m +13016 m +4 h +169 h +368 h +5 m +10 h +4 h +4 h +79 h +10 h +1 h +12 h +1220 m +1 h +4 h +3 h +4 h +266 h +64 h +4 h +4 h +13017 m +13018 m +1 h +10 h +4 h +10 h +113 h +1 h +1 h +13019 m +4 h +250 h +10 h +3 h +11 h +104 h +10 h +4 h +13020 m +1261 h +13021 m +64 h +10 h +13022 m +10 h +1 h +13023 m +10 h +13024 m +4 h +3 h +10 h +4 h +1 h +1 h +41 h +4 h +11 h +4 h +186 h +12 h +13025 m +59 h +1406 h +13026 m +1 h +1 h +10 h +885 h +1 h +4 h +13027 m +22 h +139 h +11 h +4 h +13028 m +4 h +4 h +83 h +976 h +11 h +13029 m +125 h +1 h +10 h +10 h +13030 m +4 h +10 h +10 h +1 h +1 h +59 h +4 h +4 h +13031 m +4 h +1 h +10 h +13032 m +40 h +10 h +1 h +4 h +4 h +13033 m +4 h +59 h +1 h +4 h +1822 h +109 h +45 h +13034 m +4 h +4 h +541 m +10 h +92 h +1 h +55 h +10 h +4 h +13035 m +11 h +10 h +4 h +10 h +11 h +192 h +4 h +57 h +1 h +13036 m +1 h +11 h +4 h +400 m +4 h +65 h +114 h +13037 m +10 h +4 h +4 h +13038 m +124 h +4 h +31 h +135 h +10 h +10 h +8 h +83 h +1 h +5 m +4 h +1 h +1751 m +1 h +1 h +10 h +4 h +1 h +1 h +13039 m +4 h +1 h +13040 m +190 h +3702 m +4 h +371 h +1 h +1 h +4 h +1 h +13041 m +13042 m +1 h +10 h +1 h +25 h +4 h +1 h +1 h +195 h +4 h +4 h +13043 m +1 h +56 h +1 h +4 h +10 h +10700 m +11246 m +1 h +4 h +13044 m +13045 m +4 h +517 m +4 h +13046 m +1 h +1 h +1544 m +2496 m +4 h +13047 m +7862 m +11 h +4 h +113 h +135 h +871 m +10 h +10 h +11 h +986 h +2374 m +4 h +10 h +109 h +2418 m +278 h +13048 m +10 h +1 h +13049 m +4 h +1 h +1 h +13050 m +124 h +1 h +4516 m +167 h +69 h +13 h +278 h +359 h +4 h +4 h +10 h +83 h +1 h +13051 m 
+7616 m +13052 m +13053 m +13054 m +1 h +13055 m +1 h +358 h +157 h +11990 m +10 h +4 h +11 h +1632 m +11 h +57 h +6863 m +10 h +77 h +36 h +1016 h +11 h +914 m +276 h +4 h +110 h +13056 m +110 h +4 h +4 h +13057 m +25 h +83 h +4 h +10 h +13058 m +4 h +4 h +13059 m +276 h +143 h +4 h +125 h +270 h +13060 m +4 h +1 h +1981 m +1 h +10 h +4 h +83 h +4 h +114 h +45 h +64 h +10 h +31 h +538 h +2885 m +25 h +1 h +1 h +4 h +10 h +83 h +13061 m +1 h +13062 m +13063 m +4 h +1785 m +520 h +4 h +10 h +74 h +186 h +10 h +1 h +1 h +10 h +1 h +4 h +2617 h +4 h +196 h +13064 m +119 h +11 h +13065 m +4 h +13066 m +27 h +1 h +8 h +7243 m +13067 m +13068 m +1 h +13069 m +10 h +911 m +4 h +4 h +1 h +1 h +36 h +4 h +4 h +10 h +11 h +10 h +1 h +13070 m +13071 m +13072 m +4 h +13073 m +13074 m +1 h +4 h +74 h +2433 m +4 h +4 h +125 h +358 h +13075 m +13076 m +13077 m +13078 m +12 h +4 h +97 h +13079 m +1 h +1 h +25 h +4 h +109 h +4 h +4 h +1284 m +13080 m +1 h +13081 m +1 h +13082 m +13083 m +13084 m +172 h +1 h +10 h +173 h +4 h +13085 m +4 h +1 h +10 h +10 h +4 h +10 h +4 h +332 h +114 h +2813 m +4 h +4 h +4 h +167 h +4 h +1 h +13086 m +11 h +25 h +13087 m +4 h +1 h +74 h +1 h +110 h +2865 m +10 h +184 h +41 h +57 h +4 h +13088 m +4 h +13089 m +4 h +83 h +10 h +104 h +4 h +1 h +109 h +10 h +13090 m +3558 m +59 h +285 m +3 h +109 h +1 h +1 h +1 h +10 h +1 h +13091 m +4 h +4 h +4 h +10 h +109 h +4 h +4 h +1 h +297 h +125 h +10 h +4 h +386 h +4 h +13092 m +109 h +10 h +295 h +8 h +1 h +4 h +124 h +13093 m +64 h +10 h +1281 m +1714 m +10 h +13094 m +1 h +10 h +10 h +1 h +4824 m +57 h +10 h +10 h +1 h +4 h +1359 h +3 h +13095 m +1 h +13096 m +146 h +4 h +164 h +10 h +3278 m +13097 m +1 h +3909 m +13098 m +13099 m +4 h +10 h +12 h +3383 m +41 h +119 h +1 h +4 h +4 h +10 h +10 h +124 h +2591 m +1 h +4 h +4 h +13100 m +10 h +5053 m +10 h +267 m +11 h +1 h +65 h +4 h +13101 m +10 h +1 h +57 h +104 h +4 h +1 h +13102 m +13103 m +4 h +13104 m +1 h +3089 m +6821 m +1 h +4 h +10 h +13105 m +1 h 
+195 h +13106 m +1 h +4 h +4 h +1 h +4 h +91 h +196 h +4 h +123 h +4 h +13107 m +10 h +119 h +1 h +13108 m +4 h +13109 m +13110 m +4 h +4 h +1 h +13111 m +64 h +13112 m +10 h +1 h +10 h +4 h +156 h +73 h +196 h +10 h +4 h +4 h +368 h +10 h +45 h +13113 m +4 h +4 h +167 h +1 h +1118 m +1 h +13114 m +10 h +8 h +10 h +11 h +4 h +1 h +83 h +11 h +118 h +1 h +10 h +4 h +13115 m +13116 m +13117 m +1 h +7 h +4 h +13118 m +8496 m +935 h +13119 m +69 h +1 h +4 h +10 h +3 h +59 h +10 h +1 h +1006 m +13120 m +114 h +31 h +489 m +1 h +109 h +13121 m +164 h +124 h +4 h +4 h +1 h +12218 m +13122 m +1 h +4 h +4 h +13123 m +4 h +97 h +4 h +1403 h +278 h +4 h +10 h +13124 m +278 h +57 h +4 h +4 h +4 h +276 h +13125 m +13126 m +289 h +56 h +1 h +4 h +1766 h +1646 m +10 h +4 h +11 h +13127 m +4 h +13128 m +1 h +1 h +1556 m +129 h +82 h +1 h +13129 m +4 h +4 h +229 h +1 h +359 h +92 h +10 h +11 h +10 h +13130 m +1 h +13131 m +4 h +1030 h +157 h +124 h +13132 m +13133 m +73 h +11 h +4 h +110 h +939 h +3 h +4 h +13134 m +13135 m +10 h +4 h +4 h +10 h +31 h +13136 m +4 h +13137 m +82 h +10 h +4 h +12 h +1 h +4 h +1 h +1528 m +10 h +124 h +4 h +10 h +11 h +4 h +4 h +10 h +41 h +10 h +10 h +1 h +125 h +13138 m +1 h +4 h +109 h +169 h +4 h +13139 m +13140 m +4 h +82 h +4 h +82 h +1 h +885 h +13141 m +12 h +1 h +13142 m +4 h +1 h +4 h +1 h +229 h +4 h +172 h +4 h +13143 m +4 h +11 h +10 h +4 h +10 h +266 h +13144 m +10 h +4 h +4 h +13145 m +1 h +4 h +558 m +4 h +4 h +4 h +569 h +82 h +13146 m +1 h +1 h +140 h +13147 m +10 h +13148 m +10 h +13149 m +13150 m +10 h +4 h +704 m +4 h +97 h +4 h +57 h +13151 m +1 h +4 h +4 h +4 h +1003 h +10 h +10 h +4 h +8 h +4 h +13152 m +4 h +1 h +1 h +13153 m +10 h +278 h +1 h +10 h +4 h +1 h +276 h +1 h +4 h +4 h +13154 m +1796 m +1 h +31 h +13155 m +13156 m +1 h +57 h +4 h +59 h +57 h +4 h +13157 m +488 h +1 h +4 h +9282 m +663 m +119 h +10 h +31 h +4 h +13158 m +4 h +4 h +11 h +10 h +11 h +11 h +4 h +82 h +10 h +4 h +13159 m +10 h +4 h +10 h +4 h +13160 m 
+13161 m +4 h +11 h +1 h +173 h +109 h +4 h +1 h +4 h +10 h +27 h +10 h +125 h +4 h +4 h +1 h +13162 m +1 h +10324 m +4 h +13163 m +1 h +75 m +13164 m +13165 m +25 h +1 h +4 h +4 h +1 h +10 h +10 h +97 h +10 h +1 h +13166 m +4 h +10 h +1 h +1 h +203 h +59 h +10 h +4 h +13167 m +1 h +1642 h +307 h +13 h +4 h +27 h +1 h +13168 m +10 h +1 h +13169 m +4 h +4 h +83 h +104 h +13170 m +0 h +276 h +1 h +4 h +601 h +230 h +3675 m +3112 m +1 h +464 h +114 h +4 h +13171 m +13172 m +13 h +4 h +266 h +4 h +13173 m +10 h +1 h +13174 m +13175 m +4 h +10 h +1 h +5348 m +4 h +7419 m +135 h +4 h +857 m +1 h +13176 m +219 h +13177 m +13178 m +1 h +2923 h +1 h +10 h +10 h +10 h +13179 m +4 h +4 h +113 h +114 h +276 h +13180 m +13181 m +195 h +377 h +1 h +13182 m +3 h +13183 m +4 h +124 h +1016 h +124 h +13184 m +4623 m +4 h +1 h +13185 m +4 h +109 h +4 h +25 h +5093 m +3 h +4 h +4 h +25 h +13186 m +114 h +1 h +3 h +13187 m +10 h +92 h +4 h +1 h +10 h +1089 h +13188 m +10 h +4 h +10 h +10 h +1 h +10 h +11 h +1 h +10 h +59 h +2924 m +4 h +10 h +13 h +4 h +4 h +1 h +73 h +965 m +157 h +13189 m +4 h +124 h +4 h +1309 h +10 h +338 h +230 h +687 h +4 h +13190 m +83 h +13191 m +11 h +11 h +13192 m +196 h +4 h +1116 m +13193 m +11 h +10 h +10 h +4 h +1 h +12 h +10 h +13194 m +13195 m +10 h +13196 m +13197 m +4 h +4 h +3555 m +10 h +1 h +4 h +97 h +267 m +135 h +1 h +1 h +11 h +10 h +4 h +10 h +1 h +4 h +41 h +1 h +1359 h +13198 m +59 h +4 h +1 h +13199 m +13200 m +13201 m +11 h +4 h +195 h +143 h +1 h +367 h +4 h +124 h +5504 m +1 h +1 h +3398 m +114 h +10 h +4 h +13202 m +13203 m +57 h +10 h +4 h +1 h +4314 m +13204 m +25 h +74 h +68 m +11 h +9400 m +13205 m +4 h +4 h +2510 m +1 h +1189 m +4 h +1 h +83 h +45 h +4 h +10 h +4 h +41 h +41 h +990 h +1 h +1 h +167 h +1886 m +1 h +4 h +13206 m +13207 m +13208 m +147 h +1 h +124 h +3558 m +31 h +1822 h +10 h +10 h +13209 m +13210 m +10 h +94 h +10 h +45 h +13211 m +1 h +13212 m +4 h +119 h +1 h +13213 m +147 h +13214 m +1915 m +4 h +10 h +4 h +1 h 
+4 h +4 h +383 h +13215 m +4 h +64 h +1 h +1620 m +10 h +5917 m +990 h +11 h +1 h +110 h +4 h +10 h +1 h +4 h +55 h +13216 m +195 h +4 h +4 h +147 h +6663 m +1 h +256 h +13217 m +13218 m +1 h +185 h +1 h +4 h +4 h +386 h +726 m +1 h +412 h +147 h +278 h +45 h +536 h +1 h +4 h +4 h +4 h +119 h +1 h +1 h +13219 m +1 h +25 h +10 h +4 h +13220 m +338 h +4 h +4 h +13221 m +10 h +124 h +204 m +13222 m +4 h +4 h +1 h +4 h +31 h +4 h +12 h +4 h +13223 m +13224 m +4 h +1 h +10 h +169 h +13225 m +10682 m +13226 m +13227 m +258 h +57 h +4 h +109 h +13228 m +307 h +57 h +1 h +59 h +4 h +13229 m +22 h +4 h +12 h +1 h +1 h +13230 m +4 h +13231 m +109 h +12 h +4 h +1 h +10 h +757 m +40 h +170 h +10 h +4 h +10468 m +258 h +1 h +13232 m +22 h +4 h +10 h +13233 m +10862 m +4 h +4 h +4 h +10 h +10 h +13234 m +13235 m +4 h +4 h +12 h +4 h +1 h +4 h +1 h +1 h +13236 m +1 h +1 h +1 h +104 h +119 h +4 h +4802 m +10 h +64 h +4 h +911 h +13237 m +13238 m +13239 m +4 h +1 h +10 h +4 h +1 h +4 h +1 h +1 h +1 h +965 m +536 h +4 h +4 h +1 h +4 h +112 h +4 h +4 h +4 h +13240 m +4 h +73 h +4 h +41 h +10 h +92 h +1 h +3 h +2920 m +146 h +4 h +1 h +13241 m +1 h +1 h +10 h +124 h +1 h +31 h +31 h +10 h +888 m +843 m +109 h +59 h +4 h +203 h +13 h +4 h +4 h +83 h +1 h +4 h +285 m +11 h +73 h +13242 m +124 h +83 h +10 h +4 h +1 h +4 h +10 h +3 h +4 h +4 h +11 h +195 h +59 h +195 h +1 h +10 h +10 h +10 h +13243 m +13244 m +11 h +11 h +4 h +297 h +10 h +4 h +468 m +698 m +10 h +1 h +11 h +4 h +124 h +167 h +4 h +13245 m +4 h +4 h +83 h +278 h +1 h +11 h +1 h +4 h +4 h +1 h +4 h +28 h +10 h +1 h +13246 m +10 h +109 h +13247 m +4 h +4 h +83 h +11 h +1 h +4 h +1619 h +1 h +1 h +73 h +4 h +400 m +112 h +4 h +276 h +13248 m +1 h +57 h +1 h +10 h +13249 m +4 h +4 h +1 h +4 h +3 h +1220 m +1 h +10 h +57 h +270 h +1 h +11485 m +109 h +3 h +1137 h +11 h +1 h +172 h +1 h +4 h +10 h +1 h +10 h +1 h +13250 m +2040 m +6107 m +1 h +65 h +1 h +10 h +10 h +192 h +4 h +1 h +4 h +4 h +238 h +2928 m +144 h +13251 m +1 h 
+4 h +1 h +196 h +13252 m +10 h +1 h +1 h +1 h +13253 m +185 h +4 h +13254 m +1 h +1 h +10 h +94 h +13255 m +41 h +4 h +4 h +157 h +911 h +1 h +4 h +656 m +13256 m +10 h +13257 m +97 h +143 h +204 m +4 h +169 h +1 h +4 h +13258 m +4 h +1137 h +1 h +6851 m +1 h +125 h +123 h +11 h +129 h +1 h +10 h +181 h +4 h +10 h +332 h +13259 m +10 h +109 h +1 h +4 h +13260 m +10 h +3539 m +4 h +185 h +11 h +13261 m +4 h +13262 m +4 h +1 h +11 h +123 h +4 h +10 h +109 h +4 h +4 h +10 h +74 h +8 h +10 h +1 h +1772 m +147 h +36 h +10 h +4 h +83 h +4 h +1 h +313 m +13263 m +74 h +104 h +124 h +1 h +1 h +1 h +4 h +10 h +83 h +13264 m +10 h +3 h +4 h +4 h +10 h +13265 m +109 h +4 h +4 h +1089 h +57 h +4 h +4 h +1 h +322 h +10 h +10 h +11 h +5567 m +139 h +1 h +1 h +1 h +10 h +4 h +4 h +1 h +4 h +1 h +13266 m +74 h +266 h +1 h +74 h +3558 h +114 h +31 h +10 h +10 h +1039 m +13267 m +10 h +157 h +1 h +4 h +10 h +108 h +1 h +4 h +1 h +13268 m +4 h +13269 m +13270 m +4 h +1835 m +278 h +4 h +13271 m +1 h +82 h +57 h +8 h +4 h +1 h +5557 m +13272 m +10 h +1 h +1 h +4 h +262 m +536 h +119 h +478 h +295 h +4561 m +13273 m +1 h +4 h +1 h +10 h +104 h +4 h +4 h +4 h +124 h +10 h +4 h +4 h +13274 m +4 h +4 h +4 h +4 h +4 h +23 h +4 h +13275 m +1 h +10 h +4145 m +4 h +4 h +1 h +4 h +488 h +1 h +4 h +1 h +65 h +4 h +1 h +1 h +1 h +4 h +1 h +10 h +13276 m +11 h +1137 h +1 h +4 h +4 h +13277 m +1 h +13278 m +10 h +4 h +84 m +295 h +509 m +113 h +10 h +1 h +1 h +45 h +170 h +13279 m +167 h +1 h +4 h +1 h +13280 m +4 h +4 h +3 h +13281 m +13282 m +4 h +13283 m +1 h +13284 m +1 h +10 h +2238 m +1 h +4 h +4 h +13285 m +10 h +4 h +4 h +4 h +10 h +12 h +4 h +83 h +167 h +4 h +4 h +4 h +1 h +4 h +195 h +3 h +4 h +10 h +4 h +1 h +976 h +4 h +13286 m +10 h +1 h +1 h +31 h +1 h +135 h +319 h +13287 m +4 h +4 h +10 h +4 h +4 h +4 h +3 h +1 h +59 h +13288 m +1 h +4 h +1 h +13289 m +4 h +59 h +13290 m +11 h +13291 m +1284 m +10 h +4 h +1 h +10 h +4 h +1 h +11 h +1261 h +1 h +119 h +6985 m +13292 m +6705 m +1 h 
+1 h +1 h +4 h +4 h +10 h +4 h +82 h +1553 m +1953 m +147 h +64 h +11 h +4 h +4 h +13293 m +1 h +443 h +10 h +4 h +10 h +82 h +4 h +4 h +10 h +109 h +4 h +13294 m +4 h +1 h +10 h +13295 m +41 h +65 h +82 h +5526 m +65 h +1 h +10 h +250 h +1 h +10 h +4 h +737 m +4 h +10 h +147 h +1 h +1 h +3 h +3 h +13296 m +13297 m +4 h +10 h +10 h +13298 m +13299 m +10414 m +11 h +4 h +4 h +4 h +1 h +82 h +4 h +4 h +1 h +4 h +4 h +11 h +4 h +1 h +1 h +4 h +59 h +1 h +11 h +3303 m +4 h +7999 m +73 h +124 h +1016 h +10 h +10 h +4 h +109 h +74 h +143 h +13300 m +4 h +13301 m +57 h +79 h +36 h +13302 m +1 h +13303 m +4509 m +10 h +4 h +11 h +1 h +181 h +1 h +13304 m +1 h +13305 m +1 h +10 h +10 h +13306 m +82 h +1 h +3815 m +10 h +1 h +10 h +10 h +1 h +13307 m +27 h +3 h +11 h +10 h +109 h +1 h +13308 m +8535 m +10 h +1 h +5682 m +10 h +57 h +1 h +4 h +1 h +262 h +1 h +4 h +1 h +10 h +4 h +13309 m +4 h +692 h +109 h +4 h +4 h +109 h +4 h +1 h +377 h +13310 m +146 h +11 h +1 h +295 h +4 h +4 h +4 h +112 h +125 h +4 h +4 h +13311 m +4 h +1619 h +10 h +1 h +1 h +195 h +1 h +412 h +718 h +109 h +1 h +4 h +1 h +692 h +4 h +13312 m +10 h +10 h +64 h +11 h +4 h +124 h +73 h +1 h +4 h +82 h +1 h +169 h +4 h +118 h +190 h +83 h +4 h +48 h +55 h +1 h +10 h +1 h +45 h +9027 m +4 h +1 h +4 h +1 h +11 h +10 h +278 h +10 h +939 h +1 h +55 h +258 h +4 h +10 h +2887 m +10 h +9450 m +13313 m +104 h +10 h +1 h +112 h +4 h +13314 m +4 h +10 h +25 h +10 h +4 h +13315 m +10 h +82 h +1 h +4 h +13316 m +109 h +13317 m +41 h +129 h +97 h +11 h +13318 m +13319 m +74 h +13320 m +1751 m +1 h +4 h +4 h +1 h +1 h +4 h +4 h +13321 m +123 h +11 h +4 h +36 h +1 h +74 h +10 h +620 m +4 h +10 h +10 h +1 h +4 h +1 h +77 h +13322 m +4 h +10 h +1619 h +1 h +147 h +4 h +13323 m +59 h +82 h +1 h +4 h +97 h +57 h +10 h +11 h +4 h +1409 h +1685 m +82 h +4 h +10 h +1 h +13324 m +13325 m +692 h +124 h +190 h +4 h +57 h +307 h +4 h +13326 m +10 h +358 h +601 h +4 h +4 h +295 h +4 h +4 h +4 h +174 m +1650 h +10 h +4 h +124 h 
+157 h +10 h +4 h +124 h +31 h +1 h +13327 m +4 h +13 h +2374 h +1 h +1619 h +388 m +13328 m +10 h +196 h +4 h +10 h +13329 m +13330 m +13331 m +82 h +55 h +31 h +13332 m +4 h +1 h +332 h +13333 m +139 h +13334 m +3 h +13335 m +25 h +4 h +3 h +1 h +307 h +601 h +10 h +12 h +13336 m +103 h +4 h +10 h +10 h +4 h +13337 m +4 h +1 h +10 h +13338 m +640 h +11 h +238 h +1137 h +359 h +172 h +1 h +112 h +11 h +1 h +4 h +13339 m +4 h +1478 h +4 h +4 h +65 h +10 h +83 h +4 h +1 h +59 h +4 h +10 h +1 h +4 h +230 h +313 m +3 h +10 h +4 h +10 h +4 h +4 h +4 h +1 h +1642 h +13340 m +1470 h +4 h +4 h +4 h +1 h +10 h +1 h +11 h +123 h +146 h +10 h +4 h +4 h +13341 m +4 h +1 h +31 h +4 h +1 h +10 h +41 h +1 h +13342 m +31 h +4 h +172 h +13343 m +104 h +4 h +10 h +45 h +4 h +10 h +118 h +1 h +3 h +3 h +1 h +13344 m +4 h +307 h +83 h +13345 m +13346 m +10 h +13347 m +10 h +1 h +10 h +1 h +11 h +1359 h +1117 m +146 h +4 h +11 h +1016 h +4 h +25 h +164 h +371 h +3177 m +13 h +109 h +124 h +11 h +808 m +1 h +196 h +459 h +4 h +4 h +13348 m +2788 h +1 h +48 h +13349 m +1 h +4 h +4 h +447 h +4 h +4 h +31 h +13350 m +104 h +10 h +13351 m +4 h +13352 m +10 h +82 h +1 h +10 h +13353 m +4 h +13354 m +10 h +57 h +10 h +13355 m +4 h +82 h +3307 m +31 h +4 h +13356 m +1 h +3070 m +13357 m +109 h +25 h +8324 m +536 h +1 h +10 h +4 h +4 h +13358 m +1 h +640 h +1 h +73 h +22 h +1 h +1 h +996 m +1 h +13359 m +11 h +10 h +10 h +1 h +11 h +13360 m +1 h +4 h +4 h +4 h +10 h +1 h +358 h +109 h +157 h +13361 m +757 h +10 h +1 h +4 h +3 h +11 h +4 h +13362 m +10 h +10 h +4 h +1 h +1 h +10 h +3028 m +1 h +1 h +4 h +10 h +13363 m +13364 m +10 h +1 h +8184 m +1 h +13365 m +4 h +4 h +4 h +4 h +4 h +4 h +11 h +13366 m +13367 m +13368 m +13369 m +1 h +4 h +1 h +262 h +779 h +124 h +4 h +13 h +65 h +13370 m +1470 h +125 h +59 h +10 h +25 h +124 h +11 h +10 h +4 h +4 h +31 h +27 h +113 h +10 h +1 h +4 h +10 h +13371 m +4 h +172 h +4 h +13372 m +91 h +4 h +1 h +8105 m +4 h +4 h +82 h +13373 m +3 h +1 h +4 h +4 h 
+8879 m +13 h +10 h +4 h +112 h +12 h +4 h +4 h +164 h +4 h +13374 m +1 h +135 h +114 h +13375 m +1 h +4 h +13376 m +11 h +4 h +10 h +4 h +1 h +1 h +1 h +82 h +10 h +4 h +25 h +13377 m +4 h +13378 m +79 h +13379 m +13380 m +4 h +10 h +4 h +1 h +83 h +1 h +4 h +143 h +1359 h +276 h +1 h +1 h +1 h +4 h +94 h +258 h +10 h +1 h +13 h +158 h +4 h +4 h +31 h +10 h +10 h +36 h +1 h +13381 m +4 h +1055 m +1 h +6381 m +10 h +13382 m +36 h +143 h +4 h +4 h +25 h +109 h +278 h +4 h +13383 m +1 h +13384 m +1772 m +10 h +1993 m +330 m +57 h +13385 m +338 h +4 h +109 h +10 h +4 h +13386 m +4 h +13387 m +4 h +13388 m +150 m +13389 m +13390 m +4 h +10 h +4 h +1 h +1 h +3 h +687 h +1 h +1 h +13391 m +4 h +27 h +10 h +4 h +1 h +13392 m +4 h +82 h +430 h +10 h +75 m +10 h +10 h +13 h +1 h +2004 m +11 h +1 h +10 h +238 h +31 h +4 h +31 h +124 h +109 h +976 h +4 h +2124 m +1 h +11 h +114 h +190 h +4 h +10 h +3 h +13393 m +4 h +4 h +13394 m +265 h +4 h +4 h +4 h +13395 m +4 h +10 h +41 h +12 h +7074 m +4 h +74 h +4 h +4 h +13396 m +13397 m +13398 m +4 h +125 h +1 h +4 h +125 h +13399 m +1 h +13400 m +13401 m +13402 m +737 h +31 h +10 h +13403 m +1 h +10 h +4 h +4 h +4 h +13404 m +13405 m +1 h +1 h +13406 m +4 h +13407 m +56 h +4 h +10 h +1 h +13408 m +332 h +1017 h +4 h +258 h +124 h +13409 m +10 h +1 h +1 h +109 h +13410 m +13411 m +13412 m +13413 m +4 h +13414 m +10 h +4 h +125 h +4 h +1 h +11 h +13415 m +4 h +169 h +1089 h +4 h +109 h +1309 h +1642 h +204 h +82 h +601 h +59 h +4 h +4 h +601 h +13416 m +11 h +13417 m +4 h +13418 m +4 h +4 h +4 h +10 h +4 h +169 h +1 h +10 h +10 h +11 h +11 h +1 h +250 h +4 h +10 h +10 h +10 h +1 h +10 h +1 h +13419 m +13420 m +4 h +4 h +4 h +1 h +1 h +4 h +11 h +10 h +11 h +57 h +1016 h +13421 m +1 h +1 h +104 h +13422 m +2285 m +1 h +109 h +25 h +10 h +3 h +4 h +4 h +10 h +1 h +13423 m +13424 m +4 h +4 h +13425 m +51 m +332 h +4 h +13426 m +1 h +73 h +4 h +13427 m +265 h +65 h +278 h +4 h +299 m +13428 m +10 h +1 h +332 h +4 h +13429 m +4 h +13430 m 
+10 h +1 h +1 h +1 h +13431 m +1 h +10 h +8 h +10 h +1127 m +1559 m +1796 m +4 h +4 h +359 h +1 h +57 h +4 h +4966 m +4 h +1 h +97 h +1 h +1 h +10 h +114 h +204 h +2148 m +10 h +13432 m +1074 m +91 h +412 h +13433 m +124 h +10 h +1 h +10 h +13434 m +45 h +57 h +135 h +13435 m +687 h +13436 m +1 h +61 m +82 h +4 h +1 h +104 h +1 h +4 h +4 h +13437 m +13438 m +4 h +4 h +10 h +1 h +13439 m +9397 m +4 h +10 h +4 h +56 h +976 h +10990 m +10 h +4 h +4 h +1 h +195 h +4 h +124 h +13440 m +4 h +3 h +10 h +459 h +520 h +4 h +1016 h +13441 m +4 h +10 h +10 h +10 h +13442 m +4 h +119 h +4 h +13443 m +4 h +13444 m +1 h +1 h +13445 m +13446 m +358 h +13447 m +238 h +1 h +104 h +10 h +40 h +1 h +4 h +13448 m +10 h +13449 m +13450 m +13451 m +1 h +6678 m +143 h +4 h +31 h +13452 m +13453 m +1 h +13454 m +4 h +4 h +156 h +3616 m +25 h +4 h +1504 m +4 h +82 h +1 h +25 h +73 h +4 h +109 h +4 h +1 h +4 h +13455 m +13456 m +10 h +10 h +146 h +156 h +4 h +4 h +172 h +65 h +2788 h +3 h +196 h +1 h +4 h +10 h +13457 m +1 h +13458 m +13459 m +4 h +1 h +510 m +13460 m +11 h +13461 m +1 h +13462 m +13463 m +13464 m +5296 m +1 h +13465 m +423 m +13466 m +164 h +65 h +10 h +4 h +13467 m +1 h +13468 m +1 h +10 h +10 h +13469 m +1 h +368 h +1 h +13470 m +1 h +4245 m +4 h +10 h +13471 m +1 h +11 h +13472 m +83 h +10 h +4 h +10 h +4 h +10 h +13473 m +13474 m +1 h +13475 m +295 h +2308 m +13476 m +4 h +1771 m +1 h +146 h +258 h +123 h +4 h +1764 m +13477 m +13478 m +1 h +59 h +55 h +1 h +4 h +10 h +13479 m +146 h +4 h +13480 m +13481 m +4 h +10 h +10 h +10 h +4 h +181 h +10 h +13482 m +13483 m +1 h +4 h +41 h +4 h +1100 m +10 h +146 h +22 h +1 h +4 h +2733 h +1 h +4 h +1 h +359 h +823 m +1 h +4 h +23 h +124 h +10 h +10 h +4 h +190 h +1 h +13484 m +57 h +10 h +578 h +172 h +1 h +13485 m +31 h +10 h +4 h +57 h +64 h +8346 m +1 h +1 h +258 h +1127 m +4 h +10 h +10 h +1 h +692 h +278 h +265 h +4 h +640 h +229 h +10 h +13486 m +11 h +13487 m +4 h +172 h +104 h +4 h +4 h +4 h +4 h +91 h +13488 m +10 h 
+10 h +1 h +1 h +10101 m +10 h +82 h +1 h +13489 m +4 h +4 h +4 h +13490 m +13491 m +4 h +5708 m +10 h +41 h +1 h +4 h +170 h +13492 m +65 h +4 h +1 h +13493 m +56 h +4030 m +4 h +124 h +4 h +135 h +2851 m +1993 m +1 h +82 h +146 h +4 h +4 h +1 h +1 h +10 h +4 h +13494 m +13495 m +2022 m +4 h +57 h +1 h +4 h +4 h +13496 m +4 h +332 h +4 h +1 h +1 h +4 h +31 h +1 h +386 h +4 h +1137 h +4 h +40 h +146 h +13497 m +3 h +1 h +823 m +57 h +10 h +124 h +13498 m +74 h +135 h +1 h +10 h +41 h +1 h +13499 m +10 h +54 m +5348 m +13500 m +11 h +1 h +1 h +13501 m +204 h +167 h +13502 m +1595 m +31 h +13503 m +1 h +1 h +687 h +1 h +4 h +1 h +4 h +1 h +147 h +1 h +124 h +1 h +1 h +801 m +1 h +1138 m +13504 m +1 h +11 h +10 h +13505 m +83 h +13506 m +11 h +4 h +4 h +73 h +13507 m +13508 m +3737 m +8 h +109 h +13509 m +258 h +258 h +9692 m +4 h +4 h +10 h +1 h +10 h +13510 m +1 h +13511 m +10 h +4 h +10 h +10 h +4 h +59 h +3396 m +4 h +195 h +4 h +1 h +41 h +109 h +13512 m +13513 m +13514 m +1 h +10 h +91 h +270 h +1 h +1 h +1 h +1 h +56 h +13515 m +4 h +169 h +4 h +13516 m +10 h +1 h +13517 m +13518 m +11 h +4 h +73 h +10 h +469 m +167 h +4 h +13519 m +146 h +4 h +4 h +196 h +403 h +7444 m +13520 m +13521 m +2625 m +4 h +4 h +114 h +146 h +4 h +73 h +13522 m +4 h +82 h +10 h +1 h +4 h +13523 m +1 h +4 h +13524 m +11 h +11 h +59 h +11 h +1403 h +10 h +13525 m +10 h +1 h +4 h +10 h +4 h +83 h +1 h +4 h +986 h +4 h +13526 m +4 h +11 h +270 h +10 h +4 h +4 h +208 m +3 h +13527 m +1 h +4 h +13528 m +10 h +1 h +74 h +10 h +4 h +3 h +297 h +13529 m +10 h +4 h +13530 m +13531 m +13532 m +1 h +13533 m +6852 m +4 h +13534 m +1 h +10 h +1 h +1 h +4 h +1 h +1 h +13535 m +13536 m +10 h +10 h +114 h +4538 m +13537 m +74 h +13538 m +59 h +1 h +82 h +1 h +2374 h +13539 m +125 h +10 h +4 h +1030 h +4 h +1 h +10 h +124 h +13540 m +1 h +13541 m +82 h +1 h +13542 m +13543 m +4 h +10 h +687 h +1 h +10 h +6226 m +4 h +10 h +36 h +174 m +13544 m +10 h +1 h +4 h +146 h +13545 m +6863 m +4 h +104 h +4 h 
+10 h +1 h +13546 m +13547 m +65 h +36 h +1 h +4867 m +190 h +1 h +4 h +4 h +13548 m +4 h +4 h +13549 m +4 h +4 h +74 h +4 h +4 h +10 h +5632 m +1 h +1 h +13550 m +1 h +1 h +13551 m +1 h +13552 m +1 h +11 h +13553 m +1 h +13554 m +4 h +1 h +1 h +10 h +11 h +13555 m +169 h +13556 m +1 h +13557 m +129 h +272 m +10 h +13558 m +4 h +83 h +1 h +4 h +4 h +1 h +59 h +3768 m +109 h +1 h +41 h +4 h +4 h +10 h +11 h +4 h +10 h +11559 m +13559 m +10 h +4 h +13560 m +94 h +4 h +12898 m +4 h +434 m +297 h +1 h +4 h +143 h +25 h +110 h +10 h +146 h +1 h +278 h +41 h +13561 m +1 h +25 h +488 h +11 h +10 h +1 h +124 h +1 h +4 h +13562 m +1 h +1 h +82 h +1 h +13563 m +4 h +10 h +447 h +1 h +808 m +11 h +13564 m +13565 m +10 h +10 h +4 h +443 h +10 h +143 h +4 h +4 h +443 h +13566 m +13567 m +1 h +10 h +10 h +1 h +1499 h +195 h +4 h +109 h +13 h +1 h +13568 m +104 h +4 h +13569 m +4 h +4 h +13570 m +10 h +13571 m +307 h +1 h +10 h +4 h +13572 m +83 h +13573 m +1 h +1 h +7395 m +13574 m +10 h +327 m +25 h +1 h +57 h +4 h +4 h +10 h +1 h +190 h +10 h +13575 m +4 h +13576 m +10 h +10 h +124 h +4 h +13577 m +4 h +1 h +10 h +1 h +1 h +13578 m +10 h +10 h +10 h +13579 m +93 h +45 h +412 h +360 m +13580 m +4 h +10 h +157 h +1 h +4 h +1 h +13581 m +11 h +4 h +13582 m +112 h +83 h +109 h +4 h +11 h +463 h +4 h +1 h +13583 m +4 h +119 h +367 h +4 h +41 h +1 h +4 h +1 h +718 h +13584 m +11 h +10 h +258 h +4471 m +4 h +41 h +13585 m +13586 m +2625 m +147 h +113 h +1 h +4 h +13587 m +59 h +1 h +59 h +13588 m +4 h +4 h +4 h +13589 m +41 h +1 h +4 h +150 m +250 h +10 h +13590 m +4 h +109 h +13591 m +1 h +13592 m +4 h +4 h +10 h +13593 m +238 h +4 h +359 h +1470 h +4 h +1 h +13173 m +10 h +11 h +13594 m +4 h +4 h +4 h +4 h +4 h +1765 m +4 h +11 h +1 h +125 h +5619 m +13595 m +1 h +13596 m +4 h +4 h +536 h +41 h +4 h +1 h +1620 m +4 h +4 h +146 h +4 h +1 h +28 h +1 h +185 h +1 h +113 h +11 h +1 h +1 h +36 h +91 h +10 h +31 h +79 h +10 h +4 h +1 h +620 m +10 h +13597 m +13598 m +13599 m +83 h +1 h +1 
h +11 h +55 h +4 h +184 h +10 h +167 h +10 h +48 h +59 h +82 h +1 h +10 h +11 h +124 h +4 h +10 h +4 h +31 h +13600 m +4 h +13601 m +4 h +4 h +13602 m +779 h +1685 m +13603 m +4 h +1 h +13604 m +36 h +4 h +4 h +92 h +4 h +10 h +13605 m +13606 m +1 h +1 h +92 h +65 h +4874 m +10 h +6851 m +10 h +4 h +170 h +13607 m +4 h +4 h +4 h +1 h +1 h +4 h +12 h +4 h +1 h +10 h +4 h +4 h +13608 m +4 h +1 h +285 m +10 h +1 h +1642 h +1 h +13609 m +5917 m +10 h +2308 m +4 h +169 h +4 h +4 h +13610 m +13611 m +25 h +11 h +69 h +1620 m +1 h +10 h +1 h +986 h +1 h +4 h +4 h +12 h +167 h +2717 m +266 h +10 h +196 h +124 h +13612 m +250 h +4 h +55 h +4 h +4 h +1619 h +1 h +10 h +860 m +109 h +185 h +59 h +1 h +4 h +11 h +1 h +31 h +10 h +1 h +13613 m +25 h +4 h +13614 m +103 h +13615 m +1 h +108 h +307 h +82 h +13616 m +3161 m +1 h +4 h +13617 m +13618 m +1 h +4 h +1 h +13619 m +11 h +119 h +4 h +13620 m +433 m +102 m +181 h +40 h +1016 h +8 h +444 m +124 h +13621 m +124 h +4 h +463 h +41 h +10 h +10 h +109 h +13622 m +12 h +4 h +11 h +57 h +64 h +74 h +143 h +4 h +69 h +1 h +83 h +7535 m +13623 m +4 h +1 h +1201 h +4440 m +1559 m +4 h +13624 m +443 h +13625 m +1 h +10 h +13626 m +195 h +10 h +10 h +10 h +4 h +10 h +13627 m +1411 m +36 h +10 h +83 h +4 h +41 h +6699 m +10 h +13628 m +1454 h +13629 m +13630 m +10 h +1 h +1 h +1403 h +4 h +4 h +1 h +869 m +13631 m +1 h +1 h +1 h +1 h +1 h +4 h +13632 m +4 h +4 h +4 h +4 h +4 h +13633 m +13634 m +13635 m +1 h +108 h +1 h +13636 m +4 h +4 h +13637 m +13638 m +2229 m +59 h +25 h +4 h +1 h +10 h +13639 m +4 h +4 h +1 h +13640 m +13641 m +82 h +1 h +4 h +1 h +4 h +12739 m +4 h +4 h +25 h +468 m +1780 h +4 h +266 h +10 h +41 h +13642 m +1 h +4 h +10 h +359 h +1 h +10 h +4 h +4 h +83 h +94 h +104 h +94 h +4 h +1532 m +4 h +3 h +1 h +4 h +2379 m +147 h +4 h +11 h +10 h +3 h +118 h +146 h +146 h +4 h +59 h +10 h +4 h +1 h +13643 m +4 h +4 h +1 h +1 h +228 m +1 h +31 h +10 h +36 h +338 h +13644 m +13645 m +10 h +13646 m +4 h +4 h +1 h +13647 m 
+11 h +164 h +13648 m +4 h +4 h +69 h +4 h +143 h +1 h +172 h +11 h +4 h +13649 m +630 m +10 h +4 h +4 h +8486 m +13650 m +1092 m +1 h +737 h +106 m +1 h +65 h +157 h +1 h +41 h +110 h +13651 m +13652 m +4 h +4 h +13653 m +10 h +1 h +4 h +64 h +25 h +13654 m +1 h +1 h +794 m +272 m +4 h +1 h +64 h +10 h +110 h +13655 m +13656 m +4 h +10 h +4 h +1 h +4625 m +4 h +4 h +13657 m +307 h +13658 m +4 h +1 h +1 h +13659 m +332 h +13660 m +146 h +4 h +13661 m +13662 m +10 h +181 h +4 h +4 h +1 h +6370 m +10 h +13663 m +36 h +10177 m +4 h +278 h +13664 m +4 h +13665 m +4 h +4 h +13666 m +57 h +4 h +10 h +4 h +13667 m +4 h +10 h +4 h +124 h +4 h +13668 m +4 h +83 h +4 h +13 h +11 h +1 h +79 h +4 h +10 h +4 h +1 h +1 h +125 h +443 h +13669 m +11 h +10 h +27 h +124 h +36 h +27 h +196 h +1 h +57 h +135 h +1761 m +10 h +1 h +40 h +11 h +4 h +10 h +31 h +72 m +4 h +195 h +4 h +10 h +1 h +3 h +13670 m +22 h +1955 m +4 h +4 h +13671 m +1 h +110 h +13672 m +13673 m +1003 h +10 h +57 h +8497 m +13674 m +164 h +4 h +13675 m +1 h +3143 m +1445 m +65 h +1 h +1 h +1 h +1 h +4 h +9940 m +4 h +10925 m +13676 m +10 h +4 h +13677 m +11 h +13678 m +1 h +13679 m +4 h +10 h +1 h +13680 m +4 h +3 h +104 h +10 h +13681 m +11 h +41 h +1 h +2788 h +11 h +13682 m +13683 m +4 h +1 h +1309 h +4 h +10 h +11 h +1 h +13684 m +195 h +83 h +116 m +13685 m +13686 m +10 h +8040 m +181 h +13687 m +13688 m +94 h +2309 m +13689 m +1 h +10 h +4 h +4 h +4 h +1 h +13690 m +13691 m +1975 m +125 h +1 h +10 h +10 h +10 h +4 h +4 h +1 h +10 h +4 h +7128 m +4 h +1281 m +146 h +10 h +1 h +1 h +83 h +10 h +13692 m +2418 m +4 h +1 h +10 h +10 h +1 h +83 h +82 h +1 h +1 h +4 h +10 h +91 h +13693 m +13694 m +1 h +11 h +4 h +13695 m +56 h +307 h +1 h +4 h +1 h +295 h +15 m +4 h +4 h +10 h +4 h +1 h +11 h +13696 m +4 h +1 h +31 h +299 m +13697 m +83 h +13698 m +36 h +4 h +4975 m +4 h +1 h +13699 m +114 h +13700 m +1 h +92 h +4 h +7630 m +1 h +1 h +13701 m +4 h +109 h +181 h +13702 m +1 h +10 h +1 h +1 h +2720 m +1 h +299 m 
+119 h +13703 m +36 h +59 h +1 h +433 m +41 h +10 h +13704 m +11 h +110 h +4 h +520 h +448 m +4 h +10 h +4 h +912 m +10 h +1 h +1 h +4 h +3 h +4 h +13705 m +10 h +1 h +13706 m +4 h +2914 m +146 h +135 h +13707 m +4 h +10 h +1 h +109 h +4 h +4 h +74 h +13708 m +4 h +10 h +1822 h +10 h +4 h +1 h +4 h +4 h +1 h +25 h +13709 m +13710 m +13711 m +10 h +59 h +13712 m +4 h +1 h +1 h +478 h +109 h +59 h +156 h +11 h +13713 m +1 h +184 h +13714 m +1 h +4 h +10 h +258 h +13715 m +7814 m +184 h +1 h +65 h +4 h +1 h +13716 m +11 h +11 h +276 h +97 h +4 h +13717 m +1 h +109 h +31 h +1 h +4 h +270 h +1 h +3 h +10272 m +4 h +2308 h +13718 m +4 h +1 h +143 h +4 h +1 h +1100 m +4 h +4 h +4 h +1 h +109 h +10 h +13719 m +779 h +13720 m +1 h +4 h +10 h +57 h +10 h +4 h +1 h +13721 m +59 h +4 h +10 h +1 h +1847 m +13722 m +195 h +10 h +1 h +125 h +13723 m +13724 m +5653 m +4 h +820 m +10 h +10 h +1 h +13725 m +4 h +488 h +1 h +4 h +1 h +4 h +4 h +4 h +4 h +4 h +25 h +13726 m +1 h +105 m +11 h +319 h +73 h +28 h +10 h +1 h +181 h +10 h +4 h +332 h +1685 m +74 h +4 h +4 h +4 h +1 h +13727 m +10 h +1 h +371 h +10 h +82 h +13728 m +250 h +13729 m +1 h +13730 m +4 h +4 h +1 h +31 h +258 h +143 h +13731 m +4 h +11 h +10 h +4 h +1780 h +10 h +10 h +1 h +7253 m +7243 m +4 h +73 h +3622 m +10 h +13732 m +13733 m +4 h +112 h +125 h +124 h +1 h +4 h +13734 m +10 h +2072 m +59 h +83 h +13735 m +1 h +297 h +13736 m +4 h +4 h +83 h +46 m +13737 m +299 h +601 h +10 h +1 h +64 h +1 h +10 h +13738 m +4 h +1261 h +10 h +1 h +10 h +1 h +4 h +13739 m +61 m +1 h +10 h +13740 m +4 h +4 h +1 h +4 h +4521 m +116 m +1564 m +4 h +4 h +82 h +10 h +265 h +1 h +13741 m +4 h +4 h +11 h +10 h +1 h +82 h +1 h +4 h +4 h +857 m +1 h +4 h +13742 m +4 h +4 h +119 h +13743 m +4 h +109 h +10 h +4 h +1 h +1 h +4 h +1 h +13744 m +13745 m +172 h +11 h +56 h +10 h +196 h +1 h +91 h +4 h +1714 m +1 h +4 h +10 h +4 h +1 h +64 h +10 h +4 h +13746 m +1497 m +10 h +64 h +25 h +265 h +83 h +109 h +27 h +109 h +1 h +109 h +4 h +112 h 
+13747 m +13748 m +11 h +1 h +13749 m +399 h +4 h +1 h +4 h +13750 m +13751 m +13752 m +46 m +112 h +1 h +10 h +13753 m +1 h +22 h +4 h +1 h +10 h +467 m +4 h +1 h +4 h +358 h +13754 m +1 h +2379 m +779 h +1 h +45 h +83 h +13755 m +13756 m +4 h +1337 m +332 h +114 h +4 h +108 h +92 h +124 h +10 h +10 h +13757 m +1 h +1 h +123 h +1 h +4 h +4 h +4 h +1 h +13758 m +59 h +11 h +13759 m +4 h +10 h +4 h +1 h +4 h +359 h +4 h +1 h +4 h +83 h +1 h +1 h +13760 m +649 m +140 h +13761 m +13762 m +4 h +13763 m +1 h +13764 m +124 h +185 h +4 h +241 m +1 h +10 h +4 h +41 h +13765 m +31 h +4 h +11 h +4 h +4 h +65 h +10 h +13766 m +10 h +11 h +11 h +10 h +1 h +57 h +13767 m +4 h +13768 m +3742 h +170 h +4 h +2625 h +1 h +4 h +13769 m +1 h +7585 m +1 h +1 h +4 h +10 h +10612 m +13770 m +4 h +4 h +9 m +1 h +640 h +4 h +13771 m +10 h +10 h +4 h +97 h +4 h +1 h +1 h +4 h +13772 m +4 h +11 h +72 m +4 h +10 h +4 h +10 h +1 h +13773 m +109 h +4 h +104 h +1 h +13774 m +276 h +10 h +10 h +4 h +13775 m +94 h +10 h +173 h +1 h +124 h +4 h +25 h +4 h +4 h +10 h +4 h +13776 m +368 h +11 h +13777 m +109 h +299 h +1 h +1 h +1 h +4986 m +687 h +266 h +1 h +10 h +109 h +11 h +1128 m +4 h +4 h +4 h +10 h +10 h +4 h +10 h +28 h +112 h +13778 m +1 h +4 h +1 h +13779 m +11 h +276 h +297 h +1548 m +13780 m +13781 m +13782 m +13783 m +10 h +1790 m +82 h +8 h +570 m +4 h +13784 m +1655 m +1 h +2041 m +4 h +4 h +1 h +13785 m +13786 m +13787 m +1796 m +698 m +12 h +13788 m +13789 m +1 h +4 h +13790 m +1 h +195 h +383 h +4 h +478 h +4 h +13791 m +13792 m +10588 m +737 h +4 h +4 h +11 h +4 h +211 m +264 h +112 h +4 h +10 h +538 h +124 h +13793 m +4 h +13794 m +1030 h +13795 m +1 h +4 h +1138 m +601 h +124 h +295 h +1 h +4 h +1016 h +4 h +181 h +10 h +4 h +195 h +83 h +10 h +10 h +13796 m +4 h +55 h +4 h +1 h +13797 m +10 h +1 h +41 h +4 h +1 h +4 h +4 h +185 h +13798 m +4 h +278 h +1 h +23 h +109 h +1 h +83 h +274 h +4 h +10 h +10 h +27 h +1 h +13799 m +13800 m +92 h +4 h +4 h +4 h +4 h +104 h +13 h +13801 m 
+1 h +1 h +4 h +1 h +13802 m +1 h +10 h +13803 m +97 h +25 h +13804 m +11 h +1 h +4 h +97 h +41 h +4 h +3 h +94 h +10 h +10 h +3 h +1714 m +1 h +1 h +13805 m +4 h +10 h +10 h +13806 m +4 h +1 h +172 h +13807 m +1 h +1 h +332 h +13808 m +13809 m +1 h +185 h +332 h +10 h +10 h +83 h +1 h +4 h +4 h +4 h +13810 m +4 h +13811 m +4 h +146 h +278 h +103 h +11 h +1 h +10 h +13812 m +13813 m +10 h +10 h +687 h +10 h +109 h +4 h +64 h +164 h +4 h +4 h +1 h +109 h +4 h +1 h +10 h +4 h +10 h +4 h +13814 m +478 h +10 h +13815 m +13816 m +59 h +13817 m +1 h +1822 h +13818 m +143 h +172 h +4 h +4 h +65 h +13819 m +10 h +13820 m +4 h +4 h +3435 m +4 h +184 h +661 m +11 h +13821 m +3 h +1 h +4 h +1 h +842 m +4 h +13822 m +488 h +82 h +4 h +1 h +1020 m +124 h +59 h +13823 m +2710 m +1 h +4 h +13824 m +3 h +687 h +1 h +82 h +10 h +4 h +371 h +265 h +10 h +4 h +124 h +9397 m +13825 m +8 h +4 h +1 h +4 h +1 h +4 h +4 h +190 h +13826 m +10 h +83 h +4 h +1 h +41 h +11 h +74 h +129 h +167 h +10 h +13827 m +1 h +113 h +185 h +64 h +4 h +4 h +119 h +4 h +11 h +10 h +4 h +570 m +13828 m +13829 m +332 h +4 h +10 h +4 h +10 h +13830 m +4 h +4 h +1 h +1953 m +27 h +13392 m +4 h +10 h +59 h +208 m +13831 m +13832 m +13833 m +1 h +1 h +4 h +10 h +82 h +11 h +1 h +4 h +447 h +114 h +13834 m +3 h +10 h +10 h +3 h +13835 m +94 h +1 h +4 h +1418 m +13836 m +4 h +1 h +13837 m +1 h +4 h +13838 m +13839 m +467 m +146 h +124 h +4 h +4 h +13840 m +4 h +13841 m +13842 m +13843 m +13844 m +4 h +10 h +273 m +10 h +109 h +332 h +4 h +1556 m +10 h +1442 m +1 h +1 h +4 h +13845 m +13846 m +13847 m +13848 m +1 h +185 h +1 h +447 h +1 h +1 h +10 h +124 h +266 h +4 h +4 h +1 h +11 h +110 h +10 h +10 h +12 h +10 h +97 h +13849 m +1 h +13850 m +371 h +1 h +64 h +4 h +13851 m +186 h +4 h +10 h +4 h +3 h +11 h +13852 m +156 h +4 h +4 h +82 h +13853 m +10 h +36 h +371 h +3 h +4 h +10 h +4 h +10 h +4 h +11 h +358 h +10 h +1 h +2920 m +4 h +4 h +1 h +757 h +1 h +13854 m +13855 m +13856 m +1 h +13857 m +4426 m +13858 m 
+170 h +1 h +4 h +1 h +1 h +4 h +158 h +4 h +10 h +13859 m +4 h +13860 m +10 h +10 h +57 h +13861 m +10 h +1 h +4 h +1 h +13862 m +125 h +83 h +114 h +13863 m +13864 m +4 h +3 h +13865 m +82 h +278 h +59 h +10 h +1 h +1 h +1541 m +1 h +4 h +104 h +13866 m +1 h +1 h +10 h +4 h +147 h +125 h +2508 m +82 h +13867 m +13868 m +40 h +10 h +44 m +146 h +13869 m +13870 m +3 h +172 h +912 m +11 h +104 h +10 h +939 h +73 h +10 h +10 h +4 h +10 h +4 h +10 h +10 h +74 h +13871 m +45 h +3 h +195 h +10 h +13872 m +1 h +1470 h +13873 m +10 h +10 h +1 h +4 h +390 m +1 h +41 h +10 h +1 h +8 h +123 h +368 h +36 h +13874 m +1 h +509 m +4 h +195 h +57 h +1 h +4 h +1 h +1 h +59 h +13875 m +13876 m +4 h +59 h +4 h +4 h +3396 m +1 h +91 h +1 h +57 h +3360 m +13877 m +4 h +119 h +1 h +97 h +113 h +4 h +1 h +1 h +4 h +4 h +10 h +8555 m +13878 m +1 h +4 h +4 h +13879 m +4 h +4 h +79 h +1 h +195 h +31 h +10 h +13880 m +82 h +13881 m +13882 m +1 h +10 h +135 h +4849 m +13883 m +110 h +82 h +27 h +4 h +65 h +1 h +1 h +1 h +266 h +10 h +4 h +4 h +82 h +10 h +1 h +10 h +74 h +196 h +13884 m +10 h +4 h +1 h +13885 m +10 h +1 h +10 h +1 h +10 h +4 h +1 h +4 h +10 h +181 h +25 h +4 h +1 h +124 h +196 h +1 h +41 h +459 h +94 h +13886 m +4 h +124 h +4 h +4 h +83 h +1 h +4 h +10 h +10 h +13887 m +13888 m +11 h +13889 m +73 h +83 h +4 h +4 h +4 h +258 h +4 h +1 h +13890 m +59 h +22 h +36 h +13891 m +1 h +4 h +278 h +13892 m +4 h +4 h +13893 m +4 h +13894 m +13 h +4 h +13895 m +10 h +10 h +794 m +13896 m +1 h +13897 m +1 h +13898 m +109 h +7950 m +11 h +10 h +4 h +2494 m +4 h +575 m +13899 m +59 h +13900 m +4 h +1 h +1914 m +353 m +13901 m +13902 m +4 h +4 h +1 h +295 h +4 h +5374 m +25 h +4 h +918 m +4 h +13903 m +297 h +82 h +13904 m +4 h +13905 m +1 h +13906 m +10 h +12 h +10 h +4 h +4 h +1 h +4 h +1089 h +1 h +10 h +13907 m +1 h +13908 m +4 h +5348 m +1 h +4 h +1 h +1 h +479 m +13909 m +4 h +10 h +12237 m +13910 m +13911 m +1822 h +4 h +13912 m +110 h +299 h +13913 m +13914 m +3704 m +520 h +13915 m 
+10 h +4 h +59 h +57 h +276 h +13916 m +13917 m +1 h +13918 m +13919 m +195 h +10 h +10 h +1 h +4 h +1 h +13920 m +109 h +13921 m +13922 m +4 h +4 h +4 h +112 h +4 h +124 h +73 h +10 h +13923 m +1 h +31 h +59 h +1 h +10 h +976 h +13924 m +104 h +10 h +135 h +69 h +65 h +4 h +1 h +4 h +109 h +13925 m +4905 m +10044 m +1 h +4 h +4 h +9586 m +97 h +13926 m +578 h +1020 m +4 h +4 h +27 h +4 h +4 h +190 h +1725 m +190 h +13927 m +13928 m +4 h +4 h +1 h +13929 m +13930 m +83 h +1 h +4 h +1 h +4 h +10 h +10 h +10 h +10 h +45 h +169 h +4 h +10 h +104 h +13931 m +4 h +7169 m +13932 m +13933 m +10 h +13 h +91 h +3 h +11 h +185 h +976 h +3 h +156 h +4 h +112 h +10 h +57 h +10 h +13934 m +10 h +4 h +386 h +1650 h +190 h +82 h +1 h +1 h +13935 m +1 h +11 h +11 h +1 h +4 h +13936 m +83 h +1635 m +13937 m +3982 m +4 h +83 h +1 h +0 h +353 m +4 h +4 h +10 h +616 m +4 h +13938 m +10 h +1 h +1 h +570 h +13939 m +2215 m +13940 m +4 h +119 h +4 h +1 h +13941 m +13942 m +4 h +10 h +11639 m +10 h +13943 m +11 h +1 h +1 h +109 h +103 h +55 h +10 h +13944 m +13945 m +10 h +4 h +10 h +1 h +493 m +1016 h +64 h +1 h +13946 m +1 h +1 h +1470 h +31 h +4 h +4 h +1 h +4 h +408 m +1 h +3025 m +12 h +4 h +1 h +13947 m +10 h +109 h +41 h +146 h +13948 m +57 h +10 h +4 h +167 h +4 h +13949 m +1 h +13950 m +1 h +13951 m +4 h +4 h +693 m +13952 m +74 h +125 h +1 h +57 h +1 h +124 h +57 h +4 h +1 h +10 h +1137 h +1 h +10 h +55 h +1 h +266 h +4 h +1 h +124 h +4 h +13953 m +1 h +4 h +13954 m +10 h +4 h +13955 m +10 h +13956 m +4 h +1 h +13957 m +97 h +1 h +7565 m +11 h +1 h +13958 m +4 h +4 h +10 h +36 h +10 h +1 h +11 h +1713 m +1 h +1 h +4 h +4 h +10 h +1 h +4 h +1 h +1 h +135 h +7950 m +332 h +3344 m +10 h +13959 m +4 h +13960 m +10 h +1 h +4 h +10 h +13961 m +4 h +1 h +10 h +4 h +36 h +13962 m +10 h +4 h +158 h +13963 m +13964 m +4 h +1 h +10 h +289 h +4 h +1 h +295 h +13965 m +109 h +1 h +13966 m +4 h +4 h +4 h +11 h +4 h +4 h +295 h +1556 m +11522 m +10 h +59 h +1 h +976 h +258 h +11 h +124 h 
+13753 m +444 m +4 h +258 h +10 h +10 h +13967 m +13968 m +83 h +4 h +11 h +4 h +1 h +4 h +1 h +13969 m +31 h +13970 m +10 h +307 h +677 m +1 h +3322 m +10 h +109 h +13971 m +4 h +4 h +31 h +1 h +13972 m +4 h +4 h +10 h +1 h +1 h +4 h +13973 m +1 h +36 h +59 h +1 h +260 m +4 h +1 h +13974 m +10 h +258 h +1003 h +11 h +4 h +83 h +1 h +31 h +3508 m +885 h +10 h +10 h +1 h +13975 m +1 h +13976 m +10 h +250 h +13977 m +1 h +13978 m +10 h +11 h +1053 m +4 h +140 h +13979 m +109 h +4 h +4 h +109 h +965 h +10 h +124 h +4 h +13980 m +4 h +13981 m +59 h +1 h +10 h +10 h +10 h +55 h +13982 m +59 h +10 h +59 h +57 h +13983 m +146 h +1 h +1 h +74 h +170 h +10 h +1261 h +59 h +10 h +13 h +13984 m +4 h +1 h +1 h +1 h +1 h +10 h +13985 m +65 h +13986 m +170 h +109 h +1 h +4 h +13987 m +74 h +10 h +13988 m +1 h +1 h +113 h +11 h +10 h +4 h +146 h +1 h +1 h +1 h +4 h +307 h +13989 m +10 h +2733 h +4 h +3 h +41 h +4 h +1 h +3 h +4 h +11117 m +13990 m +4 h +13991 m +1 h +2733 h +109 h +10 h +10 h +124 h +23 h +10 h +11691 m +4 h +1 h +4 h +4 h +104 h +13992 m +11672 m +13993 m +10 h +1 h +4 h +570 h +13994 m +1 h +5197 m +13995 m +10 h +4 h +4 h +1 h +11 h +10 h +1 h +94 h +4 h +4 h +91 h +1 h +73 h +3 h +1 h +4 h +13996 m +1 h +4 h +13997 m +4 h +10 h +1122 m +10 h +4 h +181 h +4 h +82 h +358 h +4 h +13998 m +4 h +13999 m +10 h +4 h +1796 m +10 h +843 m +1 h +125 h +1 h +1105 m +1 h +10 h +359 h +45 h +1 h +1 h +114 h +1 h +4 h +14000 m +1 h +45 h +4 h +185 h +185 h +1 h +1 h +4 h +4 h +114 h +7713 m +125 h +109 h +1038 m +12 h +186 h +3 h +14001 m +5111 m +2623 m +1 h +11 h +4 h +1 h +10 h +2002 m +124 h +4 h +1 h +14002 m +4 h +14003 m +59 h +11 h +14004 m +779 h +10 h +1 h +642 m +12131 m +4 h +14005 m +4 h +4 h +83 h +4 h +181 h +36 h +4 h +4 h +114 h +1 h +2840 m +4 h +1038 m +1 h +11 h +68 m +45 h +14006 m +307 h +36 h +14007 m +1 h +14008 m +1 h +1 h +10 h +4 h +167 h +4 h +14009 m +59 h +488 h +10 h +687 h +1 h +2846 m +1 h +295 h +110 h +10 h +14010 m +1955 m +1250 h +4 h 
+1 h +4 h +14011 m +14012 m +1 h +4 h +3882 m +1 h +14013 m +4 h +1 h +4 h +9228 m +358 h +14014 m +1 h +2041 m +4 h +11 h +135 h +4 h +8251 m +1 h +4 h +195 h +59 h +14015 m +59 h +10 h +1 h +11 h +56 h +1 h +10 h +1620 h +14016 m +10 h +4 h +1 h +1 h +1 h +266 h +4 h +4 h +196 h +14017 m +4 h +4966 m +1 h +4 h +1 h +11 h +4 h +1 h +1 h +1 h +986 h +4 h +59 h +10 h +1 h +79 h +4 h +4 h +1 h +4 h +1 h +4 h +109 h +1 h +146 h +8882 m +4 h +10 h +4 h +4 h +1619 h +4 h +146 h +4 h +1 h +332 h +4 h +5 m +1 h +538 h +1067 m +4 h +4 h +14018 m +4 h +10 h +14019 m +169 h +10 h +14020 m +65 h +4 h +11 h +4 h +79 h +12 h +10 h +14021 m +14022 m +125 h +4 h +4 h +4 h +14023 m +14024 m +1 h +14025 m +14026 m +10 h +83 h +14027 m +266 h +59 h +1 h +14028 m +4 h +536 h +124 h +10 h +14029 m +10 h +4 h +169 h +14030 m +4 h +14031 m +14032 m +1 h +1 h +196 h +4 h +1 h +83 h +3533 m +4 h +14033 m +1 h +4 h +41 h +12 h +10 h +14034 m +533 m +1 h +59 h +11 h +14035 m +11 h +79 h +4 h +14036 m +1 h +4 h +1 h +109 h +10 h +14037 m +4 h +14038 m +11 h +11 h +1 h +31 h +464 h +1 h +4 h +3 h +64 h +56 h +10 h +14039 m +14040 m +14041 m +14042 m +14043 m +1 h +10 h +10 h +4 h +4 h +4 h +4 h +4 h +4378 m +4 h +4 h +57 h +1 h +4 h +14044 m +1 h +2592 m +11 h +41 h +10 h +27 h +14045 m +229 h +1299 m +1 h +4 h +443 h +10 h +241 m +25 h +1 h +399 h +11 h +14046 m +10 h +10 h +73 h +4 h +10 h +94 h +2281 m +14047 m +266 h +1 h +25 h +1 h +1 h +14048 m +4 h +10 h +4 h +10 h +14049 m +1 h +77 h +1379 m +3 h +10 h +4 h +11 h +4 h +4 h +276 h +1 h +4848 m +4 h +150 m +3 h +14050 m +11 h +14051 m +1 h +14052 m +1 h +10 h +14053 m +964 m +1 h +4 h +57 h +1 h +164 h +10 h +1 h +74 h +14054 m +83 h +3177 m +14055 m +93 h +1 h +4 h +4 h +4 h +14056 m +10 h +11 h +538 h +55 h +14057 m +10 h +93 h +69 h +4 h +258 h +104 h +164 h +11 h +36 h +14058 m +4 h +13 h +1 h +1 h +10 h +59 h +14059 m +1185 m +1 h +5475 m +4 h +541 m +1 h +10 h +10 h +10 h +4 h +104 h +82 h +14060 m +11 h +10 h +4 h +274 h +1 h +4 
h +7827 m +1 h +1 h +1 h +14061 m +10 h +4966 m +4 h +4 h +4 h +195 h +4 h +4 h +14062 m +1 h +14063 m +4 h +79 h +1 h +14064 m +59 h +138 m +13435 m +1 h +14065 m +4481 m +4 h +10 h +433 h +14066 m +10 h +11 h +871 m +4 h +14067 m +4 h +185 h +11 h +4 h +4 h +10 h +79 h +4 h +195 h +4 h +4 h +83 h +276 h +14068 m +1027 m +4 h +12655 m +10 h +146 h +156 h +250 h +4 h +55 h +258 h +229 h +1886 m +4 h +1 h +4 h +10 h +1 h +73 h +4 h +7950 h +31 h +10 h +14069 m +14070 m +92 h +1 h +14071 m +45 h +1 h +124 h +14072 m +1 h +14073 m +1 h +1 h +1 h +10 h +14074 m +4 h +1 h +69 h +1 h +185 h +59 h +4 h +4 h +12964 m +10 h +10 h +1 h +1 h +4 h +1 h +1 h +10 h +124 h +59 h +14075 m +14076 m +1 h +196 h +11 h +1 h +11 h +1 h +1 h +14077 m +41 h +1 h +82 h +10 h +4 h +114 h +229 h +4 h +258 h +10 h +4 h +10649 m +13 h +4 h +6882 m +371 h +1 h +10 h +1 h +97 h +14078 m +1 h +25 h +1 h +14079 m +10 h +4 h +74 h +332 h +10 h +14080 m +14081 m +1 h +2607 m +181 h +124 h +14082 m +11 h +1764 m +4 h +4 h +185 h +332 h +69 h +4 h +4 h +4 h +185 h +114 h +1 h +1 h +4645 m +10 h +73 h +173 h +31 h +14083 m +1105 m +1 h +6726 m +10 h +1847 m +674 m +94 h +4 h +4 h +14084 m +45 h +1 h +888 m +10 h +14085 m +4 h +1 h +14086 m +1 h +10 h +4 h +1 h +10 h +4 h +1 h +4 h +4 h +10 h +1 h +10 h +10 h +1 h +10 h +14087 m +4 h +36 h +4 h +10 h +14088 m +14089 m +1 h +1886 m +14090 m +14091 m +14092 m +1 h +14093 m +146 h +1 h +11 h +4 h +125 h +4 h +14094 m +135 h +276 h +4 h +10 h +14095 m +25 h +14096 m +1 h +1 h +1 h +4 h +642 m +4 h +11 h +1 h +4 h +1 h +10 h +11 h +14097 m +10 h +14098 m +10 h +1 h +14099 m +1 h +14100 m +3 h +10 h +124 h +45 h +14101 m +14102 m +104 h +10 h +14103 m +14104 m +4 h +10 h +238 h +1 h +1 h +14105 m +10 h +45 h +2617 h +4 h +14106 m +1 h +65 h +82 h +31 h +4 h +64 h +172 h +14107 m +1 h +124 h +92 h +1796 m +770 m +276 h +1 h +4 h +36 h +4 h +10 h +139 h +82 h +82 h +14108 m +1 h +119 h +1 h +14109 m +94 h +1 h +10 h +5944 m +14110 m +41 h +1884 m +1 h +14111 m 
+22 h +1 h +4 h +14112 m +1458 m +4 h +14113 m +14114 m +4 h +14115 m +167 h +14116 m +4 h +228 m +14117 m +1 h +135 h +11 h +11779 m +14118 m +1 h +109 h +2824 m +10 h +4 h +4 h +1 h +11 h +4 h +4 h +4 h +147 h +138 m +4 h +704 m +10 h +10 h +74 h +82 h +4 h +10 h +4 h +4 h +4 h +307 h +57 h +4 h +4 h +125 h +4 h +10 h +14119 m +4 h +2535 m +3837 m +14120 m +11 h +83 h +11 h +114 h +14121 m +14122 m +55 h +4 h +10 h +31 h +10 h +57 h +25 h +59 h +14123 m +1 h +1 h +1 h +10 h +10 h +22 h +10 h +4 h +196 h +109 h +1 h +83 h +57 h +4 h +1 h +135 h +14124 m +10 h +1 h +4 h +258 h +1 h +4 h +10 h +14125 m +1 h +195 h +10 h +25 h +124 h +4 h +109 h +74 h +97 h +4 h +14126 m +57 h +358 h +1 h +14127 m +1 h +82 h +14128 m +1886 h +65 h +4 h +1 h +172 h +4 h +4 h +229 h +4 h +1 h +10 h +13 h +14129 m +477 m +4 h +4 h +4 h +4 h +857 m +10 h +4 h +4 h +4 h +14130 m +14131 m +10 h +1 h +14132 m +4 h +2438 m +1 h +481 m +2582 m +5728 m +4 h +10 h +4 h +10 h +14133 m +65 h +1 h +4 h +10 h +1655 m +1 h +10 h +146 h +11 h +10 h +10 h +82 h +1 h +4 h +1 h +1 h +10 h +169 h +1 h +1 h +1 h +1 h +146 h +10 h +4 h +4 h +65 h +14134 m +83 h +82 h +109 h +14135 m +1 h +2412 m +11 h +14136 m +14137 m +10 h +144 h +196 h +14138 m +27 h +14139 m +10 h +1 h +3 h +14140 m +4 h +73 h +10 h +1 h +4 h +1137 h +10 h +92 h +4 h +10 h +1 h +14141 m +4 h +4 h +4 h +258 h +272 h +4 h +4 h +22 h +1 h +587 m +1 h +4 h +1250 h +4 h +5470 m +14142 m +447 h +1957 m +912 m +4 h +10 h +7181 m +1 h +14143 m +1 h +4 h +10 h +14144 m +14145 m +11806 m +185 h +4 h +4 h +4 h +10 h +2928 m +11 h +2056 m +1 h +4 h +10 h +82 h +10 h +10 h +1 h +14146 m +4 h +109 h +14147 m +118 h +1 h +14148 m +10 h +14149 m +1 h +82 h +14150 m +10 h +4 h +45 h +1403 h +4 h +10 h +258 h +1201 h +4 h +266 h +307 h +11 h +1 h +140 h +4 h +1 h +14151 m +2265 m +990 h +64 h +10 h +14152 m +266 h +403 h +4 h +14153 m +4 h +13 h +4 h +125 h +4 h +4 h +1 h +4 h +1016 h +1 h +74 h +1 h +36 h +10 h +10 h +8 h +10 h +14154 m +4 h +14155 m 
+4 h +386 h +92 h +12 h +4 h +184 h +14156 m +4 h +109 h +1 h +368 h +1 h +11 h +10 h +509 m +10 h +4 h +10 h +10 h +353 h +1 h +4 h +12 h +478 h +4 h +10 h +1 h +10 h +434 m +538 h +556 m +1 h +14157 m +4 h +10 h +4359 m +4 h +1 h +1 h +1 h +10 h +4 h +14158 m +4 h +330 m +169 h +1261 h +630 m +1 h +10 h +10 h +278 h +10 h +45 h +4 h +12 h +4 h +2435 m +1053 m +23 h +4 h +267 m +1 h +10 h +75 m +124 h +57 h +3115 m +7474 m +1 h +14159 m +1 h +14160 m +10 h +4 h +1 h +4 h +14161 m +1 h +125 h +82 h +4 h +14162 m +112 h +14163 m +860 m +386 h +31 h +41 h +4 h +4 h +1 h +1 h +10 h +14164 m +185 h +104 h +14165 m +10 h +297 h +14166 m +14167 m +4 h +31 h +91 h +4 h +7271 m +14168 m +4 h +10 h +4 h +1 h +4 h +869 m +1 h +83 h +10 h +10 h +1 h +4 h +14169 m +14170 m +10 h +10 h +104 h +83 h +4 h +59 h +211 m +10 h +25 h +123 h +1 h +1 h +4 h +4297 m +14171 m +4 h +332 h +173 h +31 h +10 h +1 h +83 h +14172 m +14173 m +14174 m +4 h +10 h +1 h +10 h +1 h +1138 m +4 h +82 h +74 h +14175 m +195 h +57 h +5 h +4 h +10 h +4 h +1 h +82 h +1 h +1685 h +14176 m +14177 m +1017 h +4 h +4 h +935 h +135 h +4 h +14178 m +12301 m +4 h +1 h +1 h +4 h +4 h +14179 m +10 h +40 h +4 h +5504 m +92 h +14180 m +124 h +1 h +139 h +10 h +10 h +1 h +10 h +4 h +1 h +4 h +4 h +4 h +3742 h +1 h +14181 m +10 h +575 m +1 h +1 h +14182 m +143 h +1 h +82 h +10 h +1 h +10 h +4 h +3479 m +10 h +1 h +14183 m +4 h +1 h +4 h +3424 m +1 h +4 h +1 h +14184 m +278 h +388 m +124 h +1 h +2172 m +83 h +4 h +14185 m +1 h +368 h +1 h +11 h +4 h +82 h +1 h +4 h +4 h +965 h +1 h +10 h +14186 m +4 h +4 h +1 h +581 m +65 h +14187 m +843 m +109 h +14188 m +14189 m +83 h +109 h +11 h +241 m +10 h +4 h +986 h +4 h +4256 m +4 h +1 h +4 h +1 h +14190 m +1 h +83 h +10 h +911 h +4 h +295 h +14191 m +4 h +1218 m +83 h +1 h +4 h +4 h +59 h +10 h +4 h +478 h +14192 m +4 h +1 h +82 h +10 h +4 h +14193 m +10 h +3 h +14194 m +192 h +1 h +4 h +12571 m +1 h +1 h +14195 m +10 h +31 h +14196 m +4 h +4 h +82 h +10 h +10 h +57 h +14197 m 
+14198 m +4 h +4 h +447 h +74 h +91 h +14199 m +14200 m +10 h +1 h +83 h +1 h +1 h +14201 m +4 h +4 h +8890 m +4 h +4 h +1 h +4 h +1 h +4 h +10 h +1 h +196 h +10 h +1045 m +4 h +4 h +10 h +195 h +14202 m +14203 m +2920 m +4 h +1 h +4 h +14204 m +4 h +1 h +1278 m +14205 m +1 h +1 h +4 h +1 h +10 h +14206 m +307 h +97 h +11 h +14207 m +4 h +4 h +4 h +1 h +4 h +1261 h +4 h +14208 m +295 h +14209 m +330 m +14210 m +14211 m +1 h +4 h +13 h +10 h +14212 m +10 h +4 h +10 h +869 m +4 h +14213 m +359 h +10 h +14214 m +6747 m +13 h +4 h +10 h +4 h +169 h +4 h +14215 m +14216 m +1 h +79 h +4 h +2442 m +14217 m +82 h +1 h +4 h +4 h +1 h +14218 m +4 h +48 h +3 h +4 h +11 h +358 h +4 h +4 h +64 h +307 h +4151 m +14219 m +1 h +1 h +1 h +195 h +14220 m +27 h +1 h +4 h +4 h +14221 m +4 h +129 h +4 h +10 h +1 h +139 h +10 h +295 h +14222 m +13 h +601 h +14223 m +10 h +12 h +14224 m +10 h +10 h +1 h +4 h +1 h +10 h +4 h +14225 m +700 m +14226 m +14227 m +4 h +1 h +14228 m +10 h +14229 m +1 h +1 h +22 h +14230 m +4 h +196 h +10 h +14231 m +1 h +14232 m +14233 m +14234 m +10 h +1 h +4 h +83 h +97 h +1 h +10 h +888 m +1 h +4 h +14235 m +10 h +55 h +1 h +692 h +1 h +4 h +1 h +14236 m +1 h +4 h +14237 m +4 h +195 h +556 m +4 h +22 h +1 h +23 h +13 h +2309 m +1299 m +4 h +4 h +10 h +11 h +4 h +10 h +1 h +1 h +4 h +1504 m +10 h +14238 m +3 h +97 h +1 h +4 h +4 h +575 m +8395 m +14239 m +82 h +14240 m +56 h +172 h +1 h +195 h +4 h +14241 m +4 h +4 h +31 h +1027 m +4 h +59 h +4 h +14242 m +14243 m +10 h +10 h +4 h +4 h +1 h +1685 h +4 h +1 h +3177 m +4 h +104 h +125 h +109 h +1 h +4895 m +4 h +4 h +378 m +4 h +10 h +25 h +1 h +4 h +59 h +1 h +125 h +143 h +10 h +14244 m +14245 m +14246 m +22 h +1 h +14247 m +4 h +82 h +4481 m +79 h +11 h +1 h +36 h +1 h +4 h +10 h +14248 m +14078 m +33 m +14249 m +27 h +79 h +1 h +4 h +1 h +4 h +4 h +40 h +4 h +10 h +14250 m +14251 m +14252 m +169 h +109 h +64 h +125 h +155 m +1 h +1454 h +14253 m +4 h +139 h +14254 m +1 h +10 h +14255 m +1 h +123 h +1 h +143 
h +82 h +10 h +10 h +82 h +196 h +4 h +4 h +73 h +1454 h +4 h +14256 m +14257 m +1 h +10 h +110 h +10 h +41 h +4 h +4 h +25 h +4 h +57 h +4 h +4 h +1 h +4 h +976 h +10 h +10 h +10 h +14258 m +10 h +158 h +4 h +1 h +4 h +1 h +10 h +82 h +10 h +184 h +4 h +4 h +1 h +295 h +14259 m +172 h +1185 m +1 h +10 h +4 h +14260 m +14261 m +36 h +14262 m +1 h +4 h +10 h +14263 m +4 h +2879 m +4 h +1 h +4 h +4 h +91 h +110 h +4 h +1 h +147 h +4 h +1027 m +1 h +4 h +57 h +463 m +1 h +14264 m +4 h +1 h +125 h +533 m +1 h +109 h +1 h +1 h +4 h +10 h +14265 m +1 h +14266 m +181 h +4 h +1 h +1470 h +4867 m +1 h +14267 m +4 h +124 h +109 h +125 h +10 h +10 h +14268 m +82 h +4 h +4 h +4 h +4 h +14269 m +1 h +4 h +10 h +1 h +4 h +169 h +4 h +10 h +1 h +1 h +3923 m +4 h +14270 m +4 h +14271 m +10 h +10 h +36 h +4 h +11 h +4 h +4 h +14272 m +14273 m +14274 m +10 h +4 h +1 h +59 h +1454 h +14275 m +10 h +14276 m +1 h +3 h +4 h +156 h +6413 m +14277 m +8535 m +4 h +1 h +14278 m +10 h +1 h +11 h +14279 m +14280 m +14281 m +14282 m +10 h +1 h +307 h +4 h +459 h +14283 m +276 h +3 h +1 h +4 h +10 h +4 h +10 h +14284 m +27 h +14285 m +4 h +4 h +459 h +955 m +4 h +14286 m +14287 m +14288 m +10 h +1445 m +4 h +109 h +10 h +1 h +533 m +10 h +3299 m +4 h +14289 m +4 h +4 h +1 h +601 h +14290 m +84 m +14291 m +10 h +14292 m +1 h +57 h +25 h +1 h +170 h +92 h +129 h +1 h +1 h +250 h +14293 m +4 h +4 h +14294 m +4 h +250 h +14295 m +1822 h +14296 m +14297 m +4 h +25 h +1 h +10 h +31 h +14298 m +1 h +14299 m +94 h +10 h +124 h +14300 m +14301 m +12 h +4 h +4 h +1 h +3307 m +4 h +10 h +14302 m +125 h +14303 m +4 h +10 h +4 h +124 h +1070 m +4 h +4 h +1 h +4 h +307 h +10 h +57 h +14304 m +14305 m +10 h +3 h +110 h +4 h +82 h +276 h +83 h +4 h +25 h +4 h +135 h +65 h +4 h +14306 m +14307 m +14308 m +4 h +10 h +45 h +299 h +109 h +4 h +1 h +383 h +5567 m +1 h +14309 m +270 h +8697 m +4 h +14310 m +1 h +3 h +14311 m +10 h +14312 m +186 h +1 h +10 h +11 h +4 h +10 h +557 m +156 h +104 h +1 h +14313 m +4 h +1 
h +14314 m +4 h +520 h +14315 m +10 h +14316 m +4 h +45 h +1 h +14317 m +4 h +14318 m +14319 m +14320 m +109 h +1250 h +4 h +59 h +28 h +124 h +1 h +4 h +687 h +13 h +10 h +601 h +1 h +757 h +14321 m +12675 m +4 h +4 h +31 h +620 m +10 h +10 h +14322 m +447 h +11 h +14323 m +1 h +10 h +1 h +10 h +4 h +4 h +4 h +8 h +1 h +14324 m +10267 m +10 h +1 h +1 h +478 h +10 h +10 h +3837 m +4 h +1 h +10 h +14325 m +14326 m +14327 m +83 h +185 h +11 h +41 h +4 h +4 h +4 h +135 h +10 h +11 h +1 h +83 h +82 h +1261 h +1 h +1 h +4 h +113 h +1 h +2308 h +1 h +14328 m +297 h +11 h +4 h +2475 m +14329 m +4 h +25 h +2046 m +4 h +4 h +4 h +14330 m +10 h +12 h +4 h +4 h +2139 m +4 h +4 h +73 h +368 h +59 h +10 h +4 h +1 h +56 h +83 h +14331 m +14332 m +59 h +10 h +129 h +4 h +25 h +4 h +14333 m +124 h +1 h +196 h +4 h +4 h +1 h +4 h +59 h +4 h +1 h +10 h +3 h +4 h +4 h +4 h +114 h +1 h +14334 m +4 h +4 h +2212 m +10 h +10 h +125 h +172 h +109 h +10 h +14335 m +14336 m +278 h +4 h +1 h +158 h +520 h +1 h +14337 m +14338 m +578 h +4 h +1 h +12 h +14339 m +14340 m +92 h +1 h +10 h +14341 m +14342 m +10 h +31 h +718 h +65 h +57 h +358 h +1 h +1 h +14343 m +14344 m +332 h +1 h +4 h +14345 m +10 h +146 h +1 h +1 h +14346 m +10 h +143 h +1 h +135 h +4 h +1 h +7727 m +14 m +14347 m +82 h +1 h +986 h +31 h +14348 m +4 h +14349 m +14350 m +83 h +4 h +4 h +11 h +14351 m +93 h +4 h +119 h +14352 m +4 h +146 h +14353 m +167 h +6057 m +14354 m +4 h +157 h +41 h +112 h +172 h +4 h +11 h +124 h +258 h +109 h +1 h +1 h +383 h +1 h +109 h +14355 m +10 h +14356 m +10 h +6749 m +383 h +1 h +1 h +4 h +69 h +1 h +14357 m +82 h +14358 m +1 h +1 h +109 h +167 h +4 h +1 h +4 h +14359 m +10 h +11 h +14360 m +1 h +14361 m +82 h +41 h +4 h +143 h +14362 m +1 h +14363 m +4 h +1 h +367 m +14364 m +10 h +4 h +41 h +14365 m +14366 m +10 h +119 h +10 h +10 h +10 h +59 h +4 h +4 h +1 h +1642 m +2418 m +1 h +4 h +14367 m +143 h +91 h +10 h +10 h +4 h +104 h +620 m +4 h +14368 m +4 h +25 h +14369 m +1 h +4 h +82 h 
+14370 m +10 h +996 m +97 h +12570 m +11417 m +1 h +1 h +4 h +1269 m +4 h +4 h +4 h +5205 m +27 h +10 h +4 h +4 h +1 h +56 h +14371 m +1 h +4 h +14372 m +4 h +10 h +1535 m +1 h +1 h +1 h +1 h +4 h +4 h +857 m +1089 h +14373 m +1096 m +10 h +22 h +14374 m +4 h +10 h +10 h +1 h +146 h +94 h +83 h +45 h +5616 m +13 h +10 h +4 h +1 h +4 h +1089 h +97 h +14375 m +10391 m +13 h +4 h +10 h +1 h +14376 m +65 h +82 h +4 h +10 h +10 h +1 h +10 h +1642 h +14377 m +4 h +36 h +4 h +1 h +3 h +14378 m +12020 m +258 h +11 h +4 h +14379 m +14380 m +2770 m +146 h +25 h +14381 m +2028 m +14382 m +1 h +10 h +14383 m +14384 m +4 h +68 m +4 h +14385 m +1 h +14386 m +10 h +4 h +11 h +4 h +4 h +14387 m +238 h +10 h +368 h +14388 m +10 h +4 h +4 h +4 h +4 h +307 h +14389 m +14390 m +14391 m +1 h +4 h +1 h +1 h +4 h +57 h +1 h +297 h +5 h +1 h +4 h +5025 m +11 h +14392 m +11 h +4 h +10 h +4 h +3 h +1 h +74 h +14393 m +10 h +10 h +10 h +3143 m +4 h +4 h +794 m +14394 m +4530 m +110 h +10 h +10 h +13 h +41 h +1 h +104 h +4 h +1 h +10 h +124 h +4 h +36 h +4 h +10 h +14395 m +4 h +1 h +4 h +4 h +4 h +10 h +4 h +10 h +10 h +10 h +371 h +124 h +14396 m +1 h +278 h +1 h +322 m +4 h +3 h +4 h +3293 m +59 h +10 h +4 h +1 h +4 h +41 h +10 h +10 h +109 h +4 h +12047 m +12 h +14397 m +4 h +4 h +14398 m +3704 m +1759 m +1016 h +1766 h +4 h +10 h +4 h +4524 m +10 h +10 h +11 h +10 h +10 h +4 h +2194 m +10 h +4 h +13969 m +1 h +6869 m +1 h +143 h +135 h +14399 m +25 h +1 h +4 h +1137 h +4 h +31 h +1309 h +4 h +65 h +1 h +14400 m +79 h +4030 m +976 h +10 h +1 h +10 h +14401 m +10 h +1 h +689 m +14402 m +181 h +4 h +92 h +104 h +10 h +1 h +4 h +11 h +1 h +11 h +4 h +109 h +1 h +4 h +1309 h +14403 m +4 h +195 h +93 h +1 h +14404 m +91 h +4 h +1 h +14405 m +4 h +2339 m +10 h +1 h +10 h +10 h +1 h +104 h +4 h +1 h +1 h +3 h +4 h +1 h +1 h +1 h +57 h +11 h +1 h +278 h +1 h +1 h +642 h +10 h +1 h +1 h +1 h +59 h +14406 m +4 h +1 h +14407 m +14408 m +4 h +14409 m +4 h +4 h +4 h +4 h +4 h +11 h +4 h +10 h +57 h +4 
h +1 h +4 h +3 h +2851 m +83 h +10 h +14410 m +601 h +14411 m +823 m +10 h +1650 h +57 h +176 m +14412 m +3 h +1822 h +1 h +1619 h +14413 m +4 h +1 h +4 h +4 h +14414 m +10 h +14415 m +10 h +195 h +14416 m +1 h +100 m +41 h +4 h +1 h +10 h +4 h +124 h +14417 m +1 h +4 h +14418 m +1 h +3523 m +14419 m +4 h +10 h +808 m +1 h +10 h +1 h +4 h +74 h +4 h +14420 m +4 h +4 h +14421 m +5475 m +14422 m +1 h +27 h +1 h +14423 m +10 h +4 h +14424 m +10 h +4538 m +14425 m +1 h +74 h +1 h +1 h +4 h +4 h +12 h +14426 m +3307 m +8 h +14427 m +14428 m +1 h +1 h +14429 m +211 m +172 h +14430 m +14431 m +14432 m +245 m +14433 m +4 h +10 h +1 h +1 h +14434 m +14435 m +327 m +4 h +57 h +4 h +6726 m +14436 m +4 h +10 h +14437 m +278 h +1 h +4 h +5475 m +11 h +4 h +14438 m +14439 m +14440 m +4 h +4 h +692 h +10 h +14441 m +14442 m +14443 m +10 h +4 h +10 h +10 h +4 h +4 h +25 h +14444 m +4 h +1 h +83 h +1 h +14445 m +4 h +14446 m +1 h +14447 m +8114 m +14448 m +14449 m +1 h +1 h +14450 m +14451 m +59 h +14452 m +4 h +4 h +4 h +4 h +22 h +10 h +1 h +14453 m +10 h +1 h +167 h +14454 m +1 h +109 h +4 h +125 h +14455 m +4 h +1 h +14456 m +4 h +1 h +10 h +4 h +14457 m +143 h +4 h +139 h +10 h +14458 m +4 h +14459 m +4 h +10 h +4 h +1 h +1 h +31 h +1137 h +14460 m +1 h +14461 m +1 h +692 h +64 h +10 h +14462 m +4 h +14463 m +1261 h +1 h +14464 m +1 h +10 h +4 h +1 h +14465 m +4 h +108 h +10 h +297 h +10 h +601 h +952 m +36 h +1 h +1 h +1261 h +124 h +14466 m +10 h +1 h +10 h +3 h +92 h +1027 h +3 h +10 h +14467 m +156 h +124 h +1 h +69 h +1 h +10 h +14468 m +10 h +1 h +125 h +11 h +4 h +14469 m +14470 m +10 h +10 h +14471 m +1 h +10 h +14472 m +4 h +10 h +14473 m +10 h +4 h +143 h +1 h +14474 m +1261 h +69 h +10 h +11147 m +2379 m +14475 m +14476 m +192 h +31 h +41 h +10 h +25 h +10 h +4788 m +4 h +4 h +1 h +1 h +10 h +14477 m +4 h +1 h +4 h +1822 h +41 h +3913 m +14478 m +1 h +4 h +322 m +1 h +14479 m +22 h +14480 m +119 h +11 h +139 h +10 h +1 h +14481 m +14482 m +4 h +125 h +157 h +1 h 
+9837 m +1 h +3 h +14483 m +4 h +319 h +4 h +4932 m +10 h +4 h +5709 m +13 h +2923 h +14484 m +14485 m +1 h +14486 m +61 m +10 h +109 h +10 h +57 h +4 h +14487 m +4 h +167 h +2710 m +14488 m +990 h +258 h +4 h +11 h +10 h +10 h +14489 m +14490 m +1359 h +4 h +14491 m +41 h +14492 m +82 h +692 h +4 h +1128 m +10 h +1196 m +4 h +170 h +4 h +1 h +4 h +10 h +74 h +536 h +14493 m +14494 m +10 h +143 h +4 h +4 h +1886 h +124 h +109 h +1 h +1 h +718 h +10 h +10 h +4 h +10 h +1 h +1 h +10 h +55 h +147 h +10 h +6784 m +339 m +25 h +11 h +1089 h +94 h +57 h +124 h +14495 m +14496 m +358 h +4 h +22 h +57 h +1 h +4 h +14497 m +10 h +59 h +2265 m +279 m +1 h +82 h +520 h +10 h +4 h +36 h +4 h +195 h +14498 m +14499 m +14500 m +28 h +11 h +1092 m +4 h +10 h +74 h +4 h +10 h +1 h +1 h +11 h +2110 m +104 h +4 h +1 h +1 h +10 h +1 h +11 h +109 h +11 h +14501 m +1 h +4 h +2418 m +1 h +25 h +172 h +1 h +918 m +10 h +1 h +14502 m +10 h +11 h +10 h +1 h +10 h +2534 m +1 h +4 h +10 h +4 h +322 h +4 h +4 h +1 h +274 h +59 h +10 h +4 h +1 h +4 h +1 h +4 h +14503 m +10 h +1 h +4 h +104 h +601 h +11 h +14504 m +14505 m +6399 m +147 h +4 h +146 h +10 h +14506 m +169 h +4 h +124 h +14507 m +1 h +4 h +4 h +4 h +238 h +11 h +4 h +4 h +83 h +4 h +4 h +14508 m +104 h +14509 m +1 h +129 h +1 h +250 h +10 h +57 h +14510 m +1 h +10 h +4 h +13 h +14511 m +1 h +11 h +25 h +4 h +1646 m +14512 m +14513 m +4 h +10 h +1 h +14514 m +10 h +10 h +125 h +10 h +10 h +10 h +97 h +4 h +4 h +27 h +1 h +10 h +14515 m +4 h +4 h +14516 m +10 h +11 h +170 h +10 h +1 h +1 h +10 h +14517 m +147 h +31 h +135 h +14518 m +12170 m +1 h +14519 m +4 h +10 h +11 h +1 h +4 h +4297 m +14520 m +4 h +10 h +4 h +184 h +4 h +1 h +13 h +4 h +4 h +11 h +14521 m +4 h +1 h +1 h +7553 m +1884 m +265 h +14522 m +10 h +1 h +104 h +14523 m +56 h +4 h +14524 m +1 h +14525 m +195 h +1 h +14526 m +1 h +10 h +273 m +538 h +14527 m +3484 m +4 h +256 m +4131 m +307 h +195 h +332 h +4 h +158 h +10 h +359 h +11 h +14528 m +12 h +4 h +10 h +4 h 
+14529 m +11 h +1 h +1 h +1 h +125 h +4 h +506 m +10 h +297 h +57 h +4 h +10 h +434 m +57 h +4 h +14530 m +14531 m +4 h +4 h +83 h +4 h +4 h +295 h +110 h +135 h +83 h +278 h +1 h +14532 m +13 h +4 h +1 h +114 h +10 h +10 h +14533 m +4 h +14534 m +3607 m +1 h +266 h +4 h +1 h +1 h +14535 m +10 h +4 h +14536 m +4 h +1796 h +14537 m +10615 m +14538 m +10 h +4 h +804 m +185 h +104 h +358 h +14539 m +4 h +14540 m +10 h +10 h +238 h +125 h +12 h +14541 m +10 h +276 h +10 h +4 h +114 h +4 h +10 h +1 h +125 h +4 h +4 h +14542 m +55 h +181 h +181 h +14543 m +4 h +114 h +1 h +14544 m +1 h +1 h +6135 m +10 h +181 h +4 h +4 h +10 h +1 h +4 h +447 h +4 h +164 h +14545 m +10 h +10 h +146 h +14546 m +10 h +10 h +1 h +195 h +104 h +4 h +4 h +4 h +4 h +14547 m +113 h +74 h +367 h +4 h +1 h +10 h +114 h +4 h +119 h +4 h +4 h +123 h +10 h +104 h +11 h +1 h +14548 m +386 h +10 h +14549 m +1 h +79 h +1 h +1 h +25 h +14550 m +12005 m +14551 m +10 h +14552 m +1 h +4 h +5929 m +10 h +1 h +4 h +4 h +11 h +10 h +119 h +10 h +1 h +10 h +10 h +11 h +14553 m +14554 m +1 h +91 h +10 h +4 h +1 h +97 h +14555 m +4 h +25 h +224 m +10 h +279 m +10 h +1 h +185 h +14556 m +4 h +4 h +4 h +14557 m +125 h +167 h +40 h +124 h +1137 h +25 h +14558 m +4 h +123 h +692 h +14559 m +5505 m +1 h +4 h +1 h +447 h +1 h +10 h +172 h +2379 h +4 h +14560 m +14561 m +14562 m +10 h +10 h +4 h +10 h +82 h +14563 m +14564 m +83 h +10 h +64 h +28 h +1 h +14565 m +1677 m +14566 m +11 h +10 h +10 h +14567 m +14568 m +14569 m +4 h +12329 m +10 h +1 h +377 h +4 h +4 h +367 h +190 h +464 h +124 h +1016 h +11 h +104 h +11 h +14570 m +119 h +31 h +4 h +10 h +73 h +4 h +14571 m +4 h +640 h +4 h +14572 m +124 h +11 h +31 h +167 h +1 h +41 h +1 h +11 h +4 h +857 m +11 h +10 h +11 h +14573 m +204 h +192 h +25 h +10028 m +1089 h +4 h +4 h +1 h +73 h +14574 m +196 h +1 h +104 h +10 h +57 h +14575 m +4 h +14576 m +1 h +14577 m +10 h +4 h +4 h +4 h +57 h +1 h +4 h +14578 m +73 h +10 h +14579 m +25 h +59 h +4 h +4 h +1 h +14580 m +12 h 
+1 h +10 h +2379 h +124 h +1 h +1 h +14581 m +78 m +4 h +14582 m +14583 m +4 h +14584 m +14585 m +1 h +1 h +10 h +4 h +14586 m +14587 m +10 h +82 h +25 h +14588 m +41 h +4 h +10 h +14589 m +10 h +1 h +1 h +1 h +4 h +1 h +10 h +82 h +10 h +4 h +14590 m +14591 m +11 h +219 h +4 h +4 h +4 h +1 h +196 h +11 h +4 h +4 h +55 h +258 h +1 h +10 h +10 h +1 h +14592 m +4 h +169 h +143 h +297 h +1 h +14593 m +14594 m +74 h +82 h +14595 m +241 m +4 h +12 h +123 h +1 h +124 h +4 h +1 h +4 h +83 h +4 h +124 h +1 h +14596 m +9176 m +2447 m +1 h +4 h +4 h +14597 m +14598 m +31 h +4 h +459 h +10 h +4 h +14599 m +10 h +1 h +1 h +10 h +55 h +1 h +11 h +4 h +278 h +146 h +1 h +146 h +1 h +935 h +601 h +10 h +28 h +4 h +14600 m +4 h +10 h +4 h +4 h +14601 m +4 h +4 h +13544 m +4 h +1 h +14602 m +41 h +10 h +1 h +14603 m +14604 m +1697 m +25 h +1 h +14605 m +104 h +14606 m +4 h +1 h +4 h +146 h +82 h +25 h +14607 m +4 h +10 h +14608 m +4 h +4 h +10 h +74 h +14609 m +14610 m +591 m +1 h +1 h +14611 m +10 h +520 h +4 h +4 h +1 h +14612 m +14613 m +4 h +238 h +10 h +2788 m +266 h +4 h +338 m +5 h +1714 m +14614 m +258 h +4 h +1 h +4 h +447 h +55 h +114 h +10 h +181 h +57 h +1 h +4 h +1 h +4 h +196 h +10 h +4 h +823 m +4 h +1 h +1 h +1 h +12 h +11 h +4 h +10 h +129 h +14615 m +10 h +1 h +14616 m +1 h +185 h +14617 m +8950 m +1 h +14618 m +14619 m +10 h +569 m +1 h +14620 m +124 h +4 h +185 h +14621 m +14622 m +1 h +57 h +14623 m +1 h +14624 m +65 h +14625 m +93 h +4 h +14626 m +196 h +3 h +109 h +4 h +1 h +10 h +14627 m +14628 m +4 h +1 h +12131 m +31 h +14629 m +14630 m +1016 h +25 h +14631 m +57 h +4 h +10 h +5478 m +14632 m +109 h +1 h +14633 m +10 h +278 h +10 h +14634 m +10 h +109 h +10 h +4 h +10 h +10 h +14635 m +10 h +83 h +4 h +65 h +14636 m +1137 h +353 h +1 h +1 h +10 h +10 h +14637 m +5 h +109 h +1 h +4 h +4 h +1772 m +10089 m +92 h +10 h +10 h +1 h +1470 h +10 h +1 h +4 h +118 h +1 h +737 h +31 h +14638 m +14639 m +1 h +1595 m +14640 m +299 h +2172 m +14641 m +28 h +4 h +1 h 
+14642 m +3 h +4 h +10 h +65 h +4 h +4 h +1 h +14643 m +11 h +109 h +10 h +1 h +4 h +4 h +1 h +10 h +14644 m +4 h +4 h +14645 m +692 h +14646 m +14647 m +14648 m +57 h +4 h +125 h +4 h +1 h +2733 h +109 h +14649 m +14650 m +1 h +1 h +4 h +10 h +4 h +258 h +109 h +1 h +14651 m +1 h +536 h +4 h +2494 m +1 h +4 h +10 h +383 h +2367 m +4 h +1 h +109 h +14652 m +59 h +3 h +1838 m +195 h +11 h +10 h +4 h +10 h +4 h +4 h +4 h +10 h +1 h +1 h +10 h +14653 m +14654 m +124 h +92 h +10 h +14655 m +1 h +1 h +4 h +14656 m +1227 m +4 h +4 h +4 h +14657 m +1 h +10 h +4 h +10 h +14658 m +913 m +56 h +10 h +4 h +1 h +135 h +14659 m +10 h +83 h +1 h +14660 m +4 h +1 h +1 h +4 h +31 h +4 h +14661 m +1 h +169 h +14662 m +1 h +14663 m +10 h +64 h +4 h +4 h +4 h +274 h +14664 m +1957 m +14665 m +41 h +10 h +135 h +14666 m +11 h +4 h +10 h +10 h +1 h +13 h +10 h +10 h +14667 m +289 h +195 h +2418 h +190 h +10 h +74 h +14668 m +59 h +14669 m +4 h +4 h +4 h +4 h +10 h +104 h +4 h +11 h +3028 m +4 h +1074 m +10 h +4 h +536 h +10 h +1 h +31 h +3 h +4 h +4 h +57 h +1 h +1 h +10 h +10 h +74 h +1 h +10 h +135 h +6869 m +10 h +4 h +14670 m +14671 m +22 h +195 h +109 h +1 h +1 h +124 h +172 h +10 h +10 h +10 h +4 h +14672 m +307 h +2971 m +10 h +4 h +264 m +1 h +97 h +11 h +1 h +4 h +4 h +124 h +1 h +14673 m +1 h +4 h +14674 m +4 h +4 h +4 h +14675 m +184 h +12 h +1 h +2072 m +1030 m +14676 m +11 h +64 h +4 h +13 h +195 h +83 h +14677 m +1137 h +477 m +1 h +3083 m +4 h +10 h +14678 m +14679 m +4 h +10 h +14680 m +1 h +28 h +14681 m +1 h +1 h +1 h +9861 m +4 h +1 h +4 h +10 h +14682 m +83 h +1 h +1 h +9757 m +10 h +14683 m +14684 m +14685 m +10 h +14686 m +1 h +1 h +11 h +11 h +4 h +10 h +104 h +4 h +10 h +10 h +14687 m +14688 m +4 h +14689 m +860 m +1 h +10 h +11 h +11 h +1 h +14690 m +4 h +338 h +4 h +4 h +4 h +14691 m +10 h +104 h +1 h +11 h +4 h +1039 m +10 h +13361 m +114 h +156 h +146 h +1 h +4 h +9727 m +156 h +14692 m +124 h +4 h +1 h +6567 m +1 h +14693 m +295 h +4 h +4 h +143 h +1 h +4 h 
+11 h +14694 m +4 h +13 h +4 h +4 h +14695 m +4 h +195 h +4 h +4 h +1419 m +3 h +964 m +1 h +1 h +109 h +60 m +730 m +1 h +1 h +4 h +109 h +562 m +10 h +10 h +124 h +10 h +143 h +14696 m +10 h +1 h +172 h +1 h +14697 m +4 h +14698 m +4 h +143 h +3 h +4900 m +36 h +4 h +14699 m +1 h +4 h +45 h +10 h +10 h +82 h +1 h +55 h +10 h +976 h +4 h +1 h +2359 m +10 h +4 h +59 h +1 h +4 h +74 h +4 h +1697 m +4 h +3 h +36 h +1 h +4 h +1 h +1 h +4 h +14700 m +124 h +4 h +289 h +4 h +1 h +57 h +3 h +10 h +4 h +4 h +110 h +4 h +1 h +1 h +11 h +332 h +4 h +14701 m +14702 m +1 h +112 h +10 h +1 h +4 h +1 h +1 h +10 h +258 h +14703 m +82 h +14704 m +172 h +14705 m +10 h +10 h +10 h +1 h +10 h +1 h +10 h +12 h +640 h +59 h +8 h +4 h +1 h +14706 m +4 h +10 h +125 h +4 h +4 h +10 h +10 h +4 h +57 h +4 h +1 h +1 h +74 h +11 h +1 h +147 h +4 h +1 h +1 h +10464 m +10 h +31 h +14707 m +147 h +10640 m +14708 m +4 h +4 h +4 h +230 h +8938 m +843 m +4 h +83 h +10 h +4 h +4 h +14709 m +4 h +1713 m +4 h +4 h +278 h +64 h +10 h +104 h +6702 m +4 h +1 h +230 h +278 h +14710 m +10 h +57 h +383 h +11 h +4 h +4 h +10 h +4 h +82 h +125 h +1 h +1 h +10 h +10 h +10 h +10 h +4 h +14711 m +520 h +4 h +4 h +4 h +10 h +1 h +1 h +69 h +4 h +1 h +1 h +1 h +1639 m +4 h +569 m +14712 m +986 h +9933 m +4441 m +4 h +1 h +258 h +4 h +14713 m +40 h +4 h +509 m +857 m +4 h +83 h +4 h +14714 m +4 h +4441 m +447 h +1 h +190 h +14715 m +266 h +45 h +1 h +110 h +412 h +146 h +4 h +278 h +1 h +143 h +10 h +10 h +4 h +169 h +1 h +14716 m +258 h +1027 h +1 h +10 h +10 h +82 h +4 h +195 h +2163 m +1 h +14717 m +1 h +25 h +10 h +10 h +104 h +4 h +10 h +164 h +185 h +1337 m +4 h +27 h +10 h +147 h +4 h +147 h +4 h +167 h +4 h +14718 m +4 h +4 h +14719 m +114 h +184 h +359 h +57 h +4 h +1 h +14720 m +238 h +2442 m +4 h +1 h +1 h +4 h +4 h +258 h +10 h +10 h +10 h +1 h +4 h +14721 m +10 h +139 h +1 h +10 h +10 h +1 h +13879 m +10 h +4 h +10 h +13536 m +4 h +146 h +125 h +45 h +14722 m +192 h +1 h +94 h +1 h +10 h +4 h +4 h +4 
h +1 h +14723 m +14724 m +10 h +125 h +1 h +10 h +14725 m +536 h +14726 m +4 h +1 h +14727 m +14728 m +125 h +307 h +11 h +238 h +109 h +1 h +14729 m +1 h +5125 m +181 h +91 h +4 h +13 h +4 h +45 h +104 h +4 h +1 h +4 h +4 h +14730 m +4 h +4 h +10 h +1 h +55 h +4 h +14731 m +45 h +59 h +14732 m +1 h +4 h +14733 m +10 h +4 h +59 h +4 h +4 h +12 h +10 h +41 h +1 h +1 h +92 h +14734 m +10 h +41 h +4 h +1 h +1 h +31 h +4 h +14735 m +10 h +14736 m +1 h +14737 m +11 h +14738 m +1 h +195 h +1 h +170 h +4 h +14739 m +59 h +1 h +11 h +4 h +4 h +124 h +181 h +1 h +238 h +14740 m +4 h +10 h +14741 m +14742 m +1337 m +114 h +14743 m +31 h +45 h +4 h +14744 m +8197 m +3 h +14745 m +10 h +79 h +27 h +10 h +4 h +1737 m +386 h +4 h +14746 m +4905 m +1 h +1 h +10 h +229 h +1 h +4 h +4 h +195 h +10 h +11 h +295 h +14747 m +11 h +1 h +2769 m +14748 m +14749 m +14750 m +4 h +92 h +14751 m +1 h +581 m +4 h +4 h +195 h +14752 m +4 h +332 h +278 h +12805 m +4 h +14753 m +14754 m +169 h +10 h +10 h +4177 m +36 h +14755 m +4 h +12218 m +4 h +14756 m +10 h +5557 m +31 h +10 h +4 h +383 h +4 h +14757 m +4 h +1 h +4 h +2591 m +14758 m +1 h +1 h +3995 m +146 h +119 h +108 h +1 h +10 h +14759 m +1 h +265 h +79 h +146 h +4 h +1 h +14760 m +4 h +2490 m +10 h +123 h +109 h +14761 m +1 h +10 h +7479 m +11 h +4 h +14762 m +359 h +104 h +125 h +14763 m +4 h +1 h +41 h +1 h +282 m +14764 m +9933 m +276 h +4 h +4 h +14765 m +4 h +1 h +1 h +14766 m +12 h +10 h +14767 m +104 h +1 h +14768 m +11 h +10 h +14769 m +4 h +1 h +1 h +4 h +4 h +41 h +358 h +1 h +1 h +10 h +4 h +1 h +1 h +4 h +27 h +1 h +92 h +14770 m +14771 m +10 h +124 h +332 h +1 h +4 h +1 h +4 h +14772 m +1 h +10 h +10 h +4528 m +3 h +103 h +31 h +1 h +10 h +10 h +4 h +4 h +10 h +614 m +626 m +173 h +57 h +14773 m +14774 m +10 h +4 h +167 h +4 h +4 h +14775 m +14776 m +1 h +31 h +238 h +1 h +4 h +1 h +10 h +109 h +14777 m +319 h +10 h +4 h +4 h +1 h +83 h +14778 m +11 h +4 h +14779 m +4 h +4 h +10 h +10 h +1 h +196 h +8683 m +443 h +83 h +10 
h +124 h +11 h +4 h +4 h +4 h +4 h +14780 m +10 h +14781 m +8890 m +11 h +10 h +1 h +14782 m +10 h +4 h +11 h +1 h +124 h +4 h +4 h +4 h +14783 m +4 h +12338 m +1791 m +1796 h +4 h +10 h +10 h +4 h +82 h +10 h +4 h +4 h +4 h +258 h +14784 m +1 h +124 h +3 h +1 h +1 h +1 h +4 h +14785 m +4 h +14786 m +14787 m +124 h +4 h +14788 m +4 h +4 h +14789 m +113 h +13 h +4 h +14790 m +4 h +4 h +14791 m +14792 m +106 m +1261 h +4188 m +10 h +14793 m +14794 m +4 h +1 h +1 h +10 h +3750 m +4 h +14795 m +1 h +1 h +14796 m +11 h +119 h +14797 m +1 h +10 h +73 h +4 h +10 h +25 h +14798 m +83 h +14477 m +10 h +955 m +1 h +14799 m +4744 m +14800 m +1780 m +14801 m +104 h +14802 m +45 h +14803 m +276 h +10 h +1 h +55 h +14804 m +4 h +14805 m +75 m +14806 m +11 h +1 h +4 h +14807 m +69 h +4 h +399 h +4 h +4 h +14808 m +104 h +733 m +31 h +14809 m +10 h +14810 m +14811 m +124 h +4 h +11 h +14812 m +10 h +1952 m +4 h +14813 m +110 h +11 h +14814 m +83 h +10 h +4 h +1 h +4 h +10 h +4 h +4 h +1 h +4 h +57 h +10 h +57 h +692 h +10 h +114 h +64 h +12551 m +4 h +10 h +4 h +167 h +4 h +4 h +4 h +124 h +1 h +10 h +4 h +1 h +1 h +10 h +181 h +83 h +10 h +4 h +10 h +4 h +10 h +82 h +10 h +10 h +83 h +82 h +3170 m +14815 m +1 h +8 h +14816 m +14817 m +1 h +1 h +7395 m +4 h +270 h +307 h +14818 m +4 h +6599 m +4 h +59 h +1 h +10 h +4 h +11 h +1 h +64 h +10 h +4 h +14819 m +83 h +4 h +10 h +10 h +109 h +1 h +371 h +7592 m +4 h +4 h +14820 m +14821 m +278 h +41 h +1 h +4 h +82 h +11 h +124 h +125 h +14822 m +167 h +10 h +10 h +14823 m +4 h +4 h +10 h +1 h +266 h +83 h +1 h +4 h +4 h +14824 m +1330 m +31 h +14825 m +10 h +4 h +124 h +1574 m +11 h +763 m +41 h +14826 m +4 h +14827 m +1 h +13 h +4 h +14828 m +4 h +10 h +12637 m +10 h +1 h +4 h +14829 m +14830 m +14831 m +4 h +13944 m +642 h +14832 m +4 h +11 h +1 h +10 h +14833 m +10 h +4 h +10 h +1725 m +14834 m +10 h +4 h +4 h +14835 m +82 h +4 h +14836 m +25 h +14837 m +14838 m +10 h +4 h +1 h +68 m +4 h +10 h +1 h +4 h +167 h +144 h +4 h +14839 m 
+10 h +4 h +4 h +4 h +4 h +4 h +10 h +538 h +10 h +10 h +4 h +10 h +2845 m +36 h +1 h +408 m +14840 m +124 h +330 h +114 h +13 h +1 h +74 h +10 h +3 h +14841 m +11486 m +4 h +10 h +987 m +14842 m +1 h +10 h +14843 m +14844 m +4 h +2885 m +112 h +4 h +146 h +10 h +4 h +10 h +10 h +41 h +14845 m +1 h +57 h +2184 m +1 h +10 h +4 h +14846 m +79 h +1 h +4 h +144 h +27 h +4 h +4 h +10 h +14847 m +4 h +570 h +14848 m +1 h +3 h +4 h +124 h +4 h +14849 m +14850 m +10 h +14851 m +10 h +1 h +125 h +14852 m +1 h +14853 m +1 h +36 h +73 h +69 h +170 h +8 h +83 h +10 h +14854 m +25 h +14855 m +4 h +4 h +1 h +10 h +1 h +57 h +4 h +1 h +10 h +4 h +1 h +4 h +14856 m +4 h +14857 m +164 h +14858 m +1 h +10 h +4 h +1 h +14859 m +869 h +14860 m +82 h +4 h +10 h +266 h +10 h +330 h +1 h +10 h +10 h +83 h +14861 m +10 h +73 h +4 h +45 h +3396 m +1 h +74 h +124 h +4 h +64 h +5544 m +1 h +10 h +196 h +4 h +14862 m +14863 m +59 h +4103 m +14689 m +2699 m +146 h +147 h +299 h +14864 m +147 h +4 h +1 h +1 h +10 h +10 h +82 h +4 h +4 h +4 h +1 h +447 h +299 h +10 h +10 h +83 h +83 h +307 h +279 h +14865 m +1 h +65 h +10 h +1 h +1 h +119 h +8 h +1016 h +14866 m +1 h +238 h +65 h +1 h +1780 m +4 h +74 h +125 h +14867 m +4 h +986 h +4 h +14868 m +14869 m +57 h +4 h +10 h +114 h +4 h +1105 h +4 h +14870 m +14871 m +1 h +1 h +4 h +10 h +4 h +57 h +14872 m +14873 m +10 h +4 h +4 h +10 h +7300 m +14874 m +6668 m +488 h +59 h +14875 m +10 h +10 h +14876 m +59 h +1 h +10 h +196 h +1 h +10 h +14877 m +25 h +123 h +4 h +4 h +4 h +4 h +265 h +1 h +173 h +14878 m +10 h +4 h +1281 m +1 h +10 h +14879 m +147 h +10 h +4 h +4 h +4 h +196 h +2275 m +57 h +14880 m +3 h +2022 m +1 h +3048 m +986 h +8643 m +11 h +7872 m +1642 h +1 h +196 h +14881 m +10 h +10 h +1 h +1 h +14882 m +83 h +4 h +4 h +4 h +258 h +4 h +1 h +4 h +1 h +1 h +4 h +4 h +578 h +167 h +4 h +4 h +1 h +1137 h +1 h +10 h +109 h +10 h +4 h +4 h +265 h +172 h +1 h +1 h +196 h +1 h +73 h +1 h +36 h +57 h +4 h +14883 m +4 h +1 h +4 h +1 h +4 h +885 h 
+256 m +563 m +4 h +14884 m +14885 m +4 h +10 h +1 h +10 h +14886 m +1 h +27 h +158 h +1 h +1070 m +57 h +10 h +4 h +4 h +196 h +82 h +10 h +4 h +274 h +4 h +74 h +12884 m +4 h +10 h +4 h +156 h +4 h +14887 m +443 h +11 h +14888 m +14889 m +181 h +258 h +14890 m +4 h +4 h +14891 m +4 h +4 h +4 h +14892 m +2444 m +1 h +4 h +4 h +10 h +1 h +1454 h +4 h +135 h +11 h +59 h +4 h +1 h +10 h +124 h +4 h +124 h +11 h +3 h +4 h +1 h +12755 m +10 h +12 h +104 h +11 h +14893 m +5557 m +25 h +10 h +10 h +2846 m +14894 m +10 h +82 h +1 h +4 h +135 h +10 h +4 h +10 h +1 h +27 h +41 h +1 h +4 h +14895 m +1 h +4 h +164 h +569 h +1 h +4253 m +14896 m +57 h +57 h +1 h +4 h +82 h +10 h +10 h +1 h +1 h +1 h +82 h +4 h +1 h +1 h +14897 m +14898 m +1 h +4 h +59 h +10 h +1 h +4 h +146 h +4 h +94 h +14899 m +1 h +41 h +1 h +11 h +83 h +443 h +4 h +82 h +119 h +94 h +1 h +1 h +65 h +10 h +4 h +1 h +10062 m +4844 m +14900 m +10 h +55 h +4 h +10 h +4486 m +1 h +1 h +13349 m +464 h +4 h +1199 m +1 h +4 h +14901 m +14902 m +1 h +7479 m +4 h +1 h +4 h +4 h +4 h +14903 m +4 h +1 h +1 h +14904 m +10 h +125 h +31 h +10 h +1 h +1 h +11 h +10 h +258 h +41 h +3293 m +14905 m +14906 m +4 h +14907 m +195 h +14908 m +1 h +1 h +1 h +3 h +186 h +14909 m +11 h +10 h +10 h +4 h +1 h +1403 h +97 h +4 h +1 h +1 h +113 h +4 h +1 h +1 h +109 h +4 h +4 h +403 h +10 h +31 h +31 h +10 h +1 h +25 h +1 h +12 h +10 h +4 h +10 h +1 h +82 h +83 h +1 h +4 h +1 h +14910 m +14911 m +14912 m +278 h +1 h +14913 m +14914 m +4 h +4 h +1 h +11 h +358 h +65 h +1822 h +143 h +4 h +25 h +14915 m +172 h +2379 h +14916 m +276 h +125 h +4 h +4 h +10 h +14917 m +1 h +196 h +4 h +27 h +4 h +1250 h +1 h +14918 m +185 h +10 h +1 h +4 h +14919 m +4 h +4 h +4 h +14920 m +10 h +4 h +14921 m +109 h +14922 m +270 h +10 h +4 h +1 h +1 h +57 h +270 h +14923 m +14924 m +14925 m +307 h +14926 m +83 h +4 h +1 h +123 h +2594 m +10 h +12131 m +55 h +14927 m +3095 m +3 h +192 h +11 h +10 h +10 h +4 h +4 h +190 h +79 h +1 h +4 h +14928 m +1 h +10 h 
+1 h +14929 m +279 h +195 h +124 h +4240 m +4 h +4 h +14930 m +10 h +124 h +1 h +386 h +41 h +14931 m +1 h +11 h +4 h +4 h +83 h +4 h +4 h +11766 m +4 h +64 h +1 h +4 h +157 h +4 h +1 h +13 h +14932 m +125 h +1 h +4 h +195 h +4 h +125 h +10 h +94 h +14933 m +4 h +57 h +31 h +14934 m +4 h +10 h +14935 m +4 h +64 h +82 h +11 h +4 h +97 h +14936 m +4 h +73 h +46 m +1 h +57 h +1454 h +1 h +1 h +14937 m +4 h +13 h +1 h +112 h +14938 m +10 h +14939 m +1 h +14940 m +4 h +25 h +4 h +109 h +10 h +14941 m +1 h +169 h +14942 m +10 h +4 h +488 h +4 h +27 h +1 h +4 h +10 h +1 h +14943 m +158 h +10 h +10 h +22 h +124 h +14944 m +2721 m +1 h +143 h +14945 m +14946 m +10 h +73 h +1 h +1697 h +14947 m +41 h +1 h +10 h +4 h +11 h +14948 m +1 h +14949 m +4538 m +10 h +11 h +10 h +4 h +279 h +4 h +8 h +147 h +4 h +14950 m +167 h +10 h +106 m +1 h +1 h +14951 m +1650 h +10 h +1 h +10 h +14952 m +1 h +10 h +1 h +14953 m +195 h +173 h +11 h +3 h +1 h +14954 m +4 h +1 h +4 h +1 h +332 h +10 h +4 h +4 h +10 h +307 h +1284 m +2887 m +2928 m +10 h +2163 m +3 h +196 h +14955 m +10 h +1 h +14838 m +11 h +1 h +146 h +4567 m +4 h +14956 m +3 h +14957 m +4 h +1 h +10 h +73 h +1 h +4 h +4 h +4 h +14958 m +1 h +14959 m +4 h +12 h +14960 m +125 h +4 h +14961 m +14962 m +1915 m +1 h +14963 m +1 h +1 h +640 h +258 h +4 h +14964 m +14965 m +11 h +181 h +4 h +6663 m +14966 m +996 m +4 h +4 h +11 h +4 h +1 h +14967 m +15 m +11 h +278 h +4 h +4 h +14968 m +14969 m +192 h +195 h +14970 m +82 h +31 h +13 h +27 h +8 h +25 h +14971 m +14972 m +1 h +3 h +124 h +57 h +14973 m +1 h +1 h +11 h +4 h +4 h +4 h +4 h +14974 m +1 h +14975 m +9411 m +4 h +10 h +266 h +97 h +1 h +55 h +125 h +266 h +14976 m +14977 m +4 h +14978 m +4 h +10 h +1 h +274 h +57 h +14979 m +840 m +3161 m +14980 m +4 h +1 h +146 h +59 h +73 h +1677 m +124 h +1775 m +196 h +4 h +4 h +14981 m +147 h +779 h +1374 m +4 h +5562 m +1 h +4 h +4 h +1 h +4 h +1 h +65 h +4 h +195 h +4 h +11 h +4 h +14982 m +14983 m +4 h +1 h +4 h +79 h +4 h +1 h +125 h 
+631 m +14984 m +14985 m +4 h +4 h +8324 m +10 h +14986 m +1 h +10 h +82 h +1 h +1 h +1 h +1 h +1 h +4 h +97 h +4 h +14987 m +10 h +10 h +9176 m +14988 m +190 h +4 h +64 h +31 h +9400 m +10 h +1 h +10 h +14989 m +4 h +1 h +14990 m +538 h +14991 m +4 h +14992 m +1 h +1 h +1714 h +14993 m +109 h +1 h +4 h +4 h +11 h +14994 m +4 h +2064 m +1 h +57 h +1 h +4 h +8188 m +4 h +1 h +10 h +4 h +14995 m +125 h +1685 h +11 h +10 h +1 h +14996 m +538 h +1 h +5541 m +14997 m +10 h +14998 m +10 h +4 h +124 h +14999 m +146 h +15000 m +1 h +2002 m +4 h +4 h +10 h +4 h +10 h +4127 m +83 h +687 h +3 h +4 h +144 h +4 h +10 h +15001 m +10 h +4 h +4 h +481 m +8610 m +15002 m +11 h +5125 m +10 h +15003 m +15004 m +15005 m +10 h +15006 m +1 h +15007 m +15008 m +12 h +332 h +4 h +4 h +65 h +15009 m +3141 m +1 h +4 h +4 h +8 h +15010 m +4 h +15011 m +65 h +6963 m +10 h +10 h +563 m +1 h +15012 m +15013 m +10 h +15014 m +15015 m +1 h +4 h +172 h +4 h +1293 m +94 h +1 h +15016 m +57 h +15017 m +10 h +97 h +4 h +15018 m +1 h +13980 m +4 h +332 h +156 h +4 h +6260 m +4 h +1 h +238 h +55 h +229 h +92 h +4 h +4 h +1 h +4 h +601 h +109 h +229 h +196 h +15019 m +15020 m +1 h +11 h +1 h +11 h +124 h +15021 m +4 h +2054 m +4 h +1 h +4 h +11 h +10 h +15022 m +110 h +113 h +69 h +135 h +15023 m +1359 h +4 h +156 h +15024 m +15025 m +1 h +15026 m +1 h +10 h +15027 m +4089 m +2891 m +10 h +1 h +5422 m +536 h +15028 m +1 h +8 h +57 h +15029 m +1 h +15030 m +4 h +4 h +4 h +4 h +383 h +4 h +363 m +1 h +4 h +10 h +4 h +11 h +1 h +285 m +885 h +3 h +59 h +10 h +15031 m +185 h +41 h +4 h +1 h +9691 m +4 h +4 h +10 h +4 h +10 h +3 h +1 h +170 h +147 h +59 h +15032 m +1070 m +4 h +10 h +82 h +6783 m +4 h +15033 m +15034 m +1 h +15035 m +135 h +1 h +15036 m +1 h +1 h +1 h +48 h +4 h +83 h +1 h +146 h +10 h +1 h +1 h +4 h +10 h +135 h +1953 m +15037 m +1409 m +27 h +10 h +56 h +4 h +10 h +10 h +147 h +857 h +124 h +94 h +4 h +15038 m +10 h +10 h +359 h +4 h +12 h +4 h +1 h +10 h +14570 m +15039 m +15040 m +2303 m 
+10 h +83 h +10 h +1 h +4 h +64 h +4 h +1 h +10 h +2928 m +10 h +4 h +1 h +196 h +15041 m +1 h +15042 m +10 h +4 h +4 h +1261 h +4 h +57 h +4 h +15043 m +459 h +124 h +15044 m +1 h +55 h +1 h +45 h +10 h +15045 m +10 h +1645 m +4 h +4 h +1 h +1 h +4 h +1 h +124 h +83 h +55 h +649 m +65 h +10 h +4 h +1198 m +15046 m +575 h +1 h +15047 m +195 h +338 h +1 h +1 h +124 h +1205 m +10 h +15048 m +4 h +124 h +147 h +4 h +15049 m +4 h +1 h +15050 m +4 h +82 h +147 h +1 h +31 h +4718 m +186 h +4 h +11 h +3 h +1 h +82 h +15051 m +316 m +278 h +3 h +4 h +888 m +278 h +73 h +1 h +10 h +82 h +15052 m +164 h +15053 m +10 h +146 h +10 h +1 h +82 h +10 h +4 h +1 h +1 h +718 h +6296 m +15054 m +4 h +1 h +4 h +1 h +4 h +578 h +139 h +6501 m +4 h +4 h +3 h +4 h +10 h +3 h +4 h +4 h +1 h +119 h +10 h +1 h +4 h +10 h +10 h +1 h +2495 m +1116 m +1 h +4 h +12 h +4 h +15055 m +1 h +4 h +124 h +11 h +1 h +1 h +425 m +15056 m +4 h +1 h +15057 m +4 h +15058 m +3 h +1 h +59 h +10 h +1 h +1 h +10 h +195 h +15059 m +15060 m +15061 m +15062 m +139 h +4 h +3 h +1 h +4 h +4 h +4 h +4 h +97 h +4 h +10 h +15063 m +15064 m +270 h +1 h +4 h +74 h +10 h +15065 m +5809 m +1 h +15066 m +4 h +10 h +1470 h +1 h +1 h +1 h +4 h +4 h +82 h +4905 m +7 m +4 h +3 h +4 h +15067 m +1 h +181 h +4 h +307 h +1 h +5387 m +124 h +31 h +15068 m +147 h +4372 m +10 h +82 h +196 h +1953 m +15069 m +1 h +109 h +15070 m +104 h +169 h +10 h +4 h +4 h +15071 m +1 h +172 h +10 h +4 h +15072 m +15073 m +1 h +1470 h +4905 m +1 h +15074 m +82 h +10 h +10 h +15075 m +10 h +10 h +13 h +4 h +386 h +10 h +15076 m +4 h +15077 m +1 h +10 h +1 h +181 h +4 h +1 h +10 h +386 h +4 h +4 h +15078 m +10 h +10 h +15079 m +15080 m +4 h +6549 m +164 h +266 h +10 h +83 h +10 h +4 h +59 h +12 h +15081 m +10 h +82 h +15082 m +124 h +15083 m +12 h +4 h +10 h +15084 m +4 h +15085 m +1 h +15086 m +1 h +15087 m +1 h +1 h +1 h +25 h +15088 m +97 h +124 h +10 h +124 h +57 h +124 h +10 h +10 h +10 h +1685 h +4 h +10 h +10 h +146 h +15089 m +10 h +45 h +258 
h +276 h +15090 m +15091 m +15092 m +1 h +1 h +1 h +4 h +1 h +10 h +124 h +4 h +3216 m +15093 m +258 h +10 h +15094 m +4 h +4 h +74 h +4 h +1 h +104 h +1835 m +4 h +1137 h +15095 m +82 h +79 h +15096 m +10 h +83 h +10 h +4 h +4 h +4 h +10 h +1 h +12 h +1 h +124 h +124 h +322 h +319 h +332 h +4 h +4 h +15097 m +4857 m +15098 m +15099 m +10 h +15100 m +181 h +10 h +4 h +4 h +4 h +10 h +4 h +22 h +82 h +15101 m +4 h +1 h +4 h +10 h +10 h +15102 m +10 h +15103 m +698 m +15104 m +15105 m +74 h +4 h +10 h +31 h +4 h +4 h +15106 m +139 h +4 h +11 h +4 h +1 h +1 h +15107 m +94 h +15108 m +538 h +15109 m +1 h +15110 m +4132 m +15111 m +1 h +10 h +4 h +10 h +4 h +10 h +1 h +1027 h +109 h +3 h +15112 m +15113 m +4 h +108 h +10 h +12700 m +15114 m +15115 m +65 h +358 h +4 h +15116 m +4 h +10 h +4 h +4 h +4 h +1 h +4 h +4 h +276 h +10 h +10 h +15117 m +15118 m +1 h +533 h +256 m +3841 m +15119 m +10 h +1 h +4 h +135 h +15120 m +190 h +1556 m +97 h +91 h +10 h +15121 m +119 h +15122 m +15123 m +15124 m +4 h +4 h +57 h +4 h +15125 m +3 h +92 h +4 h +172 h +15126 m +15127 m +15128 m +94 h +15129 m +15130 m +1 h +104 h +1137 h +1 h +3 h +15131 m +4 h +5348 m +4 h +11 h +4 h +4 h +10 h +14308 m +15132 m +10 h +23 h +112 h +1 h +135 h +4 h +601 h +4 h +5632 m +10 h +4 h +15133 m +73 h +4 h +14 m +146 h +10 h +15134 m +1 h +195 h +11 h +4 h +31 h +10 h +4 h +109 h +15135 m +10 h +4 h +1 h +15136 m +1 h +15137 m +124 h +15138 m +15139 m +195 h +10 h +4 h +536 h +1576 m +10 h +15140 m +3 h +15141 m +1 h +4 h +1 h +278 h +1817 m +1 h +82 h +112 h +15142 m +195 h +143 h +15143 m +1309 h +169 h +4 h +4 h +4 h +25 h +10 h +4 h +1 h +196 h +15144 m +4 h +1 h +1 h +15145 m +82 h +1 h +4 h +15146 m +124 h +15147 m +15148 m +383 h +4 h +15149 m +1 h +104 h +1 h +238 h +1 h +1 h +4 h +4350 m +83 h +11 h +10 h +10 h +82 h +10 h +1 h +10 h +113 h +15150 m +4 h +74 h +4 h +4 h +125 h +1 h +27 h +4 h +4 h +367 h +15151 m +11 h +1 h +15152 m +139 h +4 h +976 h +124 h +1 h +195 h +15153 m +10 h +15154 
m +10 h +59 h +10 h +15155 m +808 m +238 h +10 h +872 m +4 h +36 h +15156 m +4 h +156 h +4 h +10 h +307 h +33 m +15157 m +15158 m +57 h +55 h +1 h +79 h +59 h +297 h +15159 m +31 h +4 h +1 h +10 h +4 h +15160 m +1 h +4 h +1 h +8 h +186 h +1574 m +15161 m +447 h +15162 m +15163 m +15164 m +147 h +79 h +186 h +4 h +3 h +4 h +15165 m +45 h +1 h +1725 m +3702 m +10 h +4 h +1 h +1403 h +15166 m +108 h +1 h +1 h +4 h +11 h +31 h +15167 m +10 h +15168 m +4 h +4 h +1 h +10 h +935 h +15169 m +4 h +10 h +4 h +4 h +15170 m +57 h +4 h +4 h +10 h +4 h +15171 m +3 h +10 h +97 h +4 h +10 h +112 h +10 h +48 h +1 h +1 h +1 h +114 h +83 h +82 h +164 h +15172 m +10 h +114 h +4 h +1116 m +4 h +4 h +10 h +4 h +1 h +4 h +74 h +1 h +4 h +1 h +4 h +4 h +1 h +1 h +15173 m +10 h +10 h +4 h +15174 m +82 h +435 m +59 h +4 h +83 h +10 h +2720 m +15175 m +11 h +1 h +1 h +123 h +13 h +15176 m +83 h +31 h +5545 m +15177 m +82 h +4 h +1766 h +41 h +15178 m +5982 m +15179 m +10 h +1 h +169 h +4 h +4 h +15180 m +443 h +123 h +15181 v +13854 m +11 h +1861 m +1 h +1 h +15182 m +4 h +1 h +36 h +97 h +124 h +10 h +990 h +1 h +195 h +1 h +11 h +4 h +1 h +3 h +1441 m +955 m +1 h +4 h +10 h +12 h +123 h +1261 h +15183 m +91 h +109 h +4 h +4 h +10 h +65 h +124 h +169 h +717 m +10 h +124 h +4 h +4 h +31 h +94 h +359 h +11 h +113 h +2937 m +2285 m +4 h +8 h +15184 m +104 h +4 h +4 h +297 h +10 h +4 h +4 h +15185 m +82 h +15186 m +1 h +1 h +11 h +10 h +45 h +4 h +15187 m +4 h +1 h +4 h +273 m +4 h +15188 m +1045 m +1 h +1089 h +4 h +15189 m +1 h +1 h +10 h +10 h +4301 m +4 h +45 h +83 h +4 h +195 h +64 h +146 h +10 h +4 h +1 h +10 h +10 h +15190 m +2183 m +299 h +4 h +97 h +15191 m +1261 h +1 h +454 m +1 h +15192 m +15193 m +135 h +4 h +195 h +15194 m +10 h +4 h +4 h +1 h +15195 m +13 h +278 h +4 h +5249 m +986 h +82 h +1 h +114 h +10 h +358 h +10 h +4 h +74 h +4 h +15196 m +4 h +15197 m +1 h +15198 m +1 h +4 h +4857 m +15199 m +4 h +1 h +279 h +31 h +10 h +15200 m +1 h +15201 m +1 h +4 h +4 h +4 h +156 h +11 
h +1 h +4 h +11 h +10 h +27 h +4 h +3405 m +10 h +3 h +15202 m +4 h +1 h +4 h +1089 h +4 h +1 h +15203 m +4 h +31 h +124 h +57 h +15204 m +1 h +15205 m +1 h +4 h +10 h +41 h +15206 m +15207 m +4 h +4 h +4 h +110 h +79 h +15208 m +15209 m +4 h +4 h +1 h +10 h +4 h +4 h +204 h +4 h +4520 m +4 h +135 h +4 h +10 h +129 h +82 h +15210 m +307 h +10 h +1 h +1015 m +911 h +4 h +125 h +238 h +59 h +11 h +10 h +4 h +238 h +10 h +195 h +4 h +14112 m +170 h +1 h +6290 m +195 h +11 h +1 h +1714 h +97 h +1137 h +10 h +4 h +857 h +56 h +4 h +11 h +4 h +601 h +4 h +15211 m +82 h +4 h +2788 h +15212 m +125 h +31 h +1 h +4 h +65 h +258 h +64 h +4 h +15213 m +10 h +358 h +57 h +1 h +15214 m +10 h +11 h +15215 m +15216 m +1074 m +278 h +4 h +11 h +297 h +4 h +4 h +10 h +256 m +10 h +41 h +146 h +15217 m +15218 m +1 h +4 h +307 h +3 h +15219 m +11 h +97 h +1 h +4 h +4 h +1 h +10 h +289 h +83 h +15220 m +4 h +1 h +36 h +733 m +196 h +1 h +4 h +15221 m +15222 m +1 h +10 h +108 h +4 h +92 h +4 h +258 h +15223 m +15224 m +41 h +1 h +195 h +65 h +316 m +695 m +3287 m +11 h +97 h +15225 m +1 h +4 h +1 h +10 h +1 h +1 h +15226 m +1535 m +1030 h +307 h +4 h +15227 m +358 h +10 h +10 h +124 h +1 h +15228 m +59 h +4 h +1898 m +25 h +4 h +4 h +15229 m +11 h +15230 m +4 h +4 h +10 h +4 h +31 h +15231 m +57 h +4 h +4 h +10 h +156 h +1 h +8 h +3 h +125 h +119 h +443 h +4 h +2418 h +1 h +15232 m +4 h +4 h +4 h +15233 m +56 h +802 m +11 h +1309 h +4 h +59 h +11 h +82 h +4 h +15234 m +4 h +12543 m +41 h +1 h +10 h +10 h +94 h +1 h +4 h +4 h +10 h +4 h +1 h +170 h +4 h +15235 m +10 h +4 h +15236 m +15237 m +1772 m +1 h +10 h +15238 m +1 h +4 h +1016 h +41 h +10 h +1 h +1 h +1822 h +10 h +10 h +41 h +4 h +10 h +57 h +4 h +4 h +10 h +15239 m +10 h +15240 m +4 h +15241 m +4 h +109 h +4 h +11 h +94 h +10 h +4 h +1 h +10 h +15242 m +238 h +4 h +4 h +4 h +4 h +5814 m +144 h +1083 m +3141 m +4 h +10 h +4 h +9691 m +4 h +57 h +4 h +15243 m +114 h +15244 m +10 h +15245 m +1 h +4 h +10 h +15246 m +1 h +4 h +3 h 
+285 m +10 h +15247 m +447 h +12 h +15248 m +1137 h +4 h +4 h +4 h +4 h +4 h +157 h +59 h +4 h +110 h +10 h +15249 m +266 h +25 h +10 h +45 h +59 h +10 h +4 h +15250 m +15251 m +109 h +15252 m +3 h +4 h +124 h +4 h +4 h +15253 m +31 h +1 h +15254 m +4 h +2923 h +109 h +4966 m +4 h +1260 m +27 h +74 h +332 h +15255 m +386 h +195 h +65 h +124 h +4 h +15256 m +31 h +1 h +332 h +4 h +4 h +4 h +15257 m +10 h +1386 m +15258 m +1 h +6214 m +15259 m +4 h +74 h +7400 m +1403 h +5145 m +4 h +83 h +4 h +1 h +10 h +4 h +289 h +4 h +8477 m +139 h +109 h +4 h +4524 m +15260 m +1 h +4 h +4 h +4030 m +11 h +157 h +1027 h +143 h +10 h +4 h +4 h +4 h +279 h +1 h +15261 m +15262 m +12 h +4 h +59 h +1 h +15263 m +400 m +74 h +1619 h +4 h +10 h +4 h +1 h +125 h +4 h +1 h +114 h +1 h +4 h +1 h +15264 m +15265 m +25 h +10 h +4 h +15266 m +443 h +10 h +25 h +10 h +1 h +330 h +146 h +1 h +15267 m +1 h +4 h +11 h +11 h +45 h +146 h +59 h +10 h +4 h +10 h +3 h +4 h +15268 m +4 h +125 h +74 h +59 h +4 h +10 h +295 h +79 h +1 h +125 h +4 h +4 h +11654 m +10 h +2627 m +10 h +1 h +97 h +4 h +4 h +9300 m +14 m +4 h +59 h +172 h +59 h +4 h +2887 m +1 h +94 h +190 h +103 h +10 h +59 h +15269 m +15270 m +4 h +15271 m +1 h +1 h +25 h +4 h +1 h +1 h +1 h +4 h +4 h +4 h +104 h +4 h +1 h +109 h +3 h +3523 m +169 h +11 h +10 h +27 h +493 m +79 h +4240 m +1 h +1650 h +15272 m +119 h +1751 m +4 h +4 h +15273 m +4 h +412 h +435 m +912 m +15274 m +1 h +1 h +1 h +143 h +27 h +1 h +27 h +4 h +289 h +4 h +10 h +10 h +114 h +10 h +70 m +640 h +15275 m +97 h +15276 m +1 h +1 h +119 h +4 h +28 h +4 h +4 h +11 h +4 h +4 h +104 h +15277 m +15278 m +386 h +15279 m +4 h +15280 m +15281 m +4 h +10 h +10 h +1 h +10 h +10 h +164 h +109 h +1 h +109 h +172 h +4 h +238 h +1 h +83 h +10 h +10 h +10 h +4 h +4 h +1685 h +4 h +1 h +1 h +124 h +1 h +64 h +10 h +11 h +8 h +295 h +10 h +1 h +4 h +1089 h +57 h +10 h +15282 m +109 h +10 h +11109 m +1 h +509 m +15283 m +15284 m +41 h +952 m +15285 m +15286 m +1 h +2733 h +4 h +15287 
m +15288 m +15289 m +1 h +1677 m +1 h +4 h +22 h +1 h +10 h +10062 m +307 h +83 h +3477 m +59 h +31 h +97 h +13 h +11 h +135 h +11 h +10 h +11 h +1 h +1 h +10 h +12 h +124 h +10 h +4 h +4 h +4 h +170 h +1137 h +307 h +41 h +113 h +83 h +4 h +4 h +4 h +1 h +10 h +15290 m +270 h +15291 m +1 h +1 h +4 h +1 h +692 h +4 h +109 h +4 h +110 h +10 h +1 h +10 h +10 h +79 h +36 h +15292 m +1 h +1 h +10 h +1 h +443 h +1 h +4 h +15293 m +10 h +1 h +1642 h +1016 h +238 h +4 h +1 h +11 h +4 h +368 h +1 h +1766 h +1 h +399 h +11 h +10 h +4 h +15294 m +1 h +11 h +258 h +72 m +1 h +10 h +31 h +1893 m +74 h +11 h +15295 m +7646 m +1 h +4 h +1 h +4 h +295 h +15296 m +1 h +135 h +10 h +4 h +443 h +11 h +319 h +1 h +12047 m +15297 m +170 h +15298 m +135 h +1 h +1 h +4 h +55 h +9175 m +15299 m +15300 m +11 h +10 h +4 h +15301 m +353 h +478 h +143 h +266 h +15302 m +15303 m +1 h +57 h +124 h +15304 m +10 h +338 h +10 h +15305 m +4 h +536 h +4 h +15306 m +59 h +1 h +1 h +2475 m +1 h +91 h +10 h +1 h +4 h +15307 m +10 h +15308 m +1 h +1 h +10 h +5613 m +10 h +4 h +15309 m +330 h +4 h +10 h +15310 m +31 h +15311 m +1 h +1 h +15312 m +4 h +10 h +15313 m +57 h +4 h +4 h +1 h +15314 m +4 h +59 h +55 h +92 h +15315 m +15316 m +4 h +4 h +4 h +4 h +15317 m +4 h +276 h +929 m +403 h +15318 m +2041 m +4 h +15319 m +4 h +13 h +10 h +83 h +12389 m +4 h +125 h +94 h +185 h +184 h +4 h +10 h +104 h +256 h +15320 m +15321 m +656 m +82 h +1 h +82 h +25 h +1 h +4 h +15322 m +56 h +10 h +1 h +1016 h +10 h +11 h +10 h +4 h +1 h +8346 m +147 h +12 h +295 h +82 h +169 h +1 h +10 h +1 h +1 h +10 h +36 h +3622 m +1 h +4 h +109 h +15323 m +59 h +1 h +4 h +1 h +15324 m +3679 m +31 h +1 h +170 h +4 h +172 h +4 h +10 h +4 h +1 h +4 h +10 h +1881 m +15325 m +1 h +10 h +1650 h +1 h +10 h +83 h +4 h +1 h +704 m +114 h +4 h +1 h +1 h +10 h +15326 m +4 h +15327 m +10 h +4 h +295 h +694 m +14098 m +1 h +4 h +1 h +129 h +3155 m +377 h +15328 m +56 h +1 h +4 h +1 h +1914 m +1595 m +10 h +15329 m +125 h +10 h +1 h +4 h 
+15154 m +4 h +4 h +208 m +4 h +91 h +196 h +4 h +4 h +77 m +15330 m +1 h +1 h +1 h +4 h +779 h +4 h +297 h +258 h +1 h +4 h +82 h +3344 m +56 h +368 h +11 h +15331 m +1105 h +4 h +1 h +4 h +82 h +4 h +4 h +11 h +15332 m +1137 h +4 h +15333 m +147 h +4 h +124 h +10 h +1 h +4 h +15334 m +1 h +15335 m +195 h +10 h +4 h +15336 m +11 h +15337 m +4 h +4 h +124 h +10 h +1 h +935 h +15338 m +15339 m +25 h +4 h +4 h +4 h +1 h +2865 m +123 h +4 h +167 h +82 h +1 h +119 h +4 h +1 h +512 m +4 h +4 h +124 h +15340 m +92 h +59 h +1 h +857 h +10 h +25 h +4 h +1 h +13 h +4 h +10 h +1 h +10 h +1 h +430 m +15341 m +4 h +5065 m +591 m +15342 m +10 h +157 h +15343 m +4 h +4 h +1 h +11 h +28 h +59 h +10 h +15344 m +10 h +10 h +1 h +264 h +41 h +10 h +15345 m +15346 m +11 h +10 h +146 h +15347 m +4 h +4 h +164 h +97 h +1 h +83 h +4 h +10 h +15348 m +125 h +399 h +4 h +104 h +1 h +1 h +196 h +57 h +10 h +1 h +15349 m +4 h +1 h +1 h +10 h +59 h +1 h +195 h +11 h +15350 m +4 h +65 h +1840 m +15351 m +15352 m +124 h +10 h +104 h +112 h +4 h +15353 m +4 h +15354 m +4 h +5348 m +1 h +1 h +367 h +123 h +4 h +10 h +15355 m +1 h +36 h +11 h +11 h +15356 m +10 h +15357 m +15358 m +15359 m +1 h +1250 h +250 h +15360 m +4 h +48 h +4 h +630 m +11 h +1 h +65 h +1 h +147 h +10 h +4 h +143 h +1 h +6133 m +15361 m +15362 m +1 h +10 h +12 h +1 h +15363 m +4 h +4 h +25 h +5017 m +4 h +4 h +15364 m +1 h +15365 m +92 h +10 h +15366 m +36 h +103 h +4 h +15367 m +4 h +4 h +10 h +4 h +92 h +7 m +358 h +15368 m +4 h +642 h +13099 m +15369 m +1 h +83 h +15370 m +1322 m +15371 m +124 h +4 h +69 h +4 h +1 h +10 h +15372 m +4 h +10 h +1 h +4 h +15373 m +443 h +4441 h +15374 m +279 h +1764 m +10 h +10 h +124 h +15375 m +92 h +97 h +1 h +124 h +4 h +4 h +4 h +10 h +1685 h +1 h +687 h +90 m +4 h +15376 m +4 h +10 h +15377 m +10 h +15378 m +3 h +109 h +25 h +4 h +1 h +139 h +4 h +181 h +11 h +15379 m +4 h +4 h +10 h +258 h +1759 m +4 h +4 h +15380 m +4 h +15381 m +1 h +31 h +15382 m +229 h +10 h +1 h +146 h +11 h +57 
h +1 h +4542 m +125 h +15383 m +258 h +4 h +82 h +1 h +10 h +15384 m +109 h +114 h +10 h +10 h +1 h +1 h +15385 m +4 h +41 h +169 h +4 h +124 h +83 h +4 h +25 h +15386 m +15387 m +4 h +935 h +10 h +7381 m +15388 m +10 h +15389 m +125 h +158 h +4 h +1470 h +125 h +10 h +11427 m +538 h +15390 m +4 h +10 h +11381 m +1 h +1 h +146 h +15391 m +4 h +1 h +4 h +157 h +2535 m +1 h +10 h +4 h +4 h +4 h +4 h +12 h +14473 m +704 m +3 h +12 h +966 m +15392 m +13 h +1 h +332 h +15393 m +4 h +15394 m +41 h +1 h +59 h +1 h +15395 m +10564 m +10 h +4 h +4 h +3028 m +11 h +4 h +4 h +2733 h +15396 m +1 h +1074 m +5047 m +124 h +10 h +1 h +104 h +4 h +4 h +10 h +15397 m +4 h +15398 m +1 h +41 h +129 h +4 h +4 h +478 h +1 h +92 h +1 h +5911 m +73 h +4 h +15399 m +4 h +10177 m +4 h +4 h +229 h +74 h +11 h +31 h +1 h +1 h +4 h +1 h +1 h +1 h +757 h +1 h +15400 m +1 h +172 h +15401 m +4 h +1 h +14529 m +15402 m +10 h +4 h +1 h +4 h +15403 m +4 h +4513 m +15404 m +4 h +4 h +10 h +4 h +4 h +1 h +4 h +15405 m +367 h +1137 h +1 h +4 h +6882 m +11 h +1 h +10 h +1 h +83 h +15406 m +383 h +4 h +4 h +250 h +109 h +15407 m +181 h +4 h +1 h +15408 m +4 h +1 h +319 h +138 m +1 h +10 h +10 h +125 h +4 h +1 h +4 h +15409 m +25 h +4 h +332 h +4 h +11778 m +1 h +4 h +15410 m +15411 m +1 h +1 h +4 h +4464 m +15412 m +4 h +4 h +1914 m +11 h +4 h +4 h +4 h +4 h +10 h +1 h +4 h +22 h +83 h +10 h +56 h +1 h +805 m +238 h +15413 m +911 h +4 h +10 h +15414 m +4 h +36 h +1 h +1 h +1 h +976 h +11 h +10 h +57 h +125 h +4 h +1 h +15415 m +15416 m +114 h +4 h +10532 m +1454 h +15417 m +65 h +125 h +15418 m +4 h +1 h +48 h +65 h +97 h +4 h +15419 m +4 h +56 h +97 h +1 h +3657 m +4 h +368 h +4 h +82 h +4 h +10 h +11 h +93 h +4 h +4 h +15420 m +4 h +15421 m +72 m +1 h +1 h +229 h +1 h +569 h +1 h +4 h +15422 m +4815 m +109 h +3 h +1 h +10 h +4 h +4 h +1 h +11 h +4 h +15423 m +15424 m +11 h +4 h +10028 m +104 h +4 h +94 h +4 h +4 h +999 m +73 h +82 h +1 h +59 h +307 h +885 h +238 h +15425 m +4 h +10 h +3 h +4 h +11 h 
+135 h +4 h +4 h +124 h +31 h +508 m +4 h +82 h +1 h +65 h +82 h +11 h +4 h +1 h +1 h +4 h +15426 m +5377 m +83 h +59 h +15427 m +1 h +186 h +10 h +2002 m +97 h +1 h +15428 m +195 h +4 h +196 h +15429 m +4 h +4 h +1 h +156 h +11 h +4 h +4 h +1403 h +1 h +3 h +4 h +156 h +1 h +10 h +11 h +1 h +4 h +4 h +1 h +4 h +57 h +4 h +31 h +4 h +1838 m +1454 h +4 h +164 h +4 h +4 h +1 h +1 h +25 h +4 h +196 h +10 h +15430 m +15431 m +82 h +4 h +403 h +1 h +4 h +15432 m +4 h +11 h +10 h +1 h +4 h +164 h +114 h +10 h +4 h +229 h +1 h +4 h +4 h +11 h +135 h +1 h +135 h +1 h +15433 m +15434 m +185 h +15435 m +10 h +15436 m +4 h +15437 m +10 h +12873 m +1 h +124 h +15438 m +4 h +4 h +10 h +10 h +10 h +966 m +1 h +1 h +1835 m +3 h +4 h +1893 m +4 h +124 h +125 h +4 h +10 h +57 h +4 h +4 h +15026 m +3 h +10 h +144 h +15439 m +4 h +15440 m +1 h +289 h +4 h +1 h +4 h +10 h +15441 m +4 h +10 h +1 h +11 h +10 h +1 h +4 h +9692 m +11 h +15442 m +353 h +109 h +10 h +1 h +65 h +195 h +4 h +1 h +10 h +4 h +10 h +4 h +4 h +4 h +1 h +1 h +147 h +538 h +4 h +1 h +15443 m +260 m +15444 m +4 h +1 h +15445 m +124 h +1 h +59 h +1 h +10 h +4 h +1 h +10 h +4 h +4 h +15446 m +104 h +1 h +601 h +1 h +4 h +10 h +4 h +169 h +15447 m +6528 m +1278 m +104 h +15448 m +82 h +59 h +15449 m +386 h +1 h +4 h +10 h +1 h +1 h +4 h +10 h +59 h +10 h +4 h +4 h +1 h +1 h +190 h +185 h +10 h +1 h +1 h +143 h +4 h +1 h +1 h +1 h +1074 m +4 h +1 h +11 h +15450 m +15451 m +77 h +1 h +1993 m +4 h +15452 m +82 h +1403 h +45 h +3 h +109 h +10 h +1 h +1989 m +4 h +146 h +169 h +278 h +4 h +104 h +4 h +533 h +1 h +118 h +4 h +368 h +10 h +1 h +143 h +10 h +15453 m +125 h +59 h +469 m +4 h +10 h +1 h +10 h +83 h +15454 m +92 h +57 h +3 h +1198 m +15455 m +22 h +1269 m +4 h +10 h +82 h +59 h +15456 m +4 h +3453 m +278 h +3 h +1 h +109 h +8206 m +113 h +10 h +65 h +12576 m +1 h +15457 m +73 h +15458 m +10 h +3 h +15459 m +4 h +4 h +1 h +4 h +82 h +1 h +10 h +307 h +15460 m +15461 m +10 h +1 h +4 h +10 h +4 h +4 h +4 h +4 h +4 h 
+4 h +10 h +630 m +4 h +10 h +1382 m +15462 m +4 h +4 h +1 h +10 h +10 h +59 h +1 h +1304 m +4 h +10 h +41 h +15463 m +1 h +2840 m +533 h +4 h +15464 m +1 h +15465 m +4 h +4 h +59 h +31 h +250 h +1 h +1 h +119 h +4 h +4 h +10 h +15466 m +1 h +146 h +25 h +10 h +124 h +4 h +1 h +15467 m +195 h +15468 m +10 h +10 h +10 h +1 h +79 h +10 h +1 h +1 h +4 h +11 h +1 h +156 h +65 h +10 h +15469 m +1595 m +10 h +1 h +1 h +15470 m +10 h +2314 m +1 h +167 h +4 h +1 h +1772 m +5475 m +10 h +15471 m +15472 m +15473 m +10 h +1 h +4 h +25 h +10 h +4 h +11 h +4 h +4 h +4 h +15474 m +31 h +575 h +12 h +4 h +1 h +15475 m +57 h +59 h +4 h +15476 m +4 h +3 h +11 h +12 h +1 h +1 h +113 h +15477 m +4 h +1 h +1 h +10 h +15478 m +181 h +82 h +185 h +15479 m +10 h +15480 m +15481 m +11 h +15482 m +94 h +4 h +538 h +15483 m +4 h +156 h +15484 m +10 h +10 h +1308 m +15485 m +15486 m +986 h +1 h +10 h +7938 m +1 h +1 h +1 h +15487 m +12 h +15488 m +15489 m +124 h +10 h +1 h +15490 m +1 h +15491 m +4 h +190 h +10 h +65 h +4 h +1 h +15492 m +4608 m +10 h +45 h +57 h +1 h +10 h +4 h +15493 m +12 h +3398 m +4 h +10 h +1 h +109 h +6545 m +195 h +31 h +1 h +59 h +1564 m +1 h +1 h +1 h +15494 m +109 h +1 h +10 h +1 h +2041 m +1 h +4 h +10 h +276 h +1 h +4 h +15495 m +15496 m +10 h +1 h +10 h +1 h +4 h +1 h +10 h +1 h +10 h +1 h +15497 m +15498 m +4 h +65 h +4 h +1 h +82 h +4 h +15499 m +15500 m +10 h +1 h +11 h +135 h +15501 m +57 h +195 h +15502 m +4 h +1 h +1 h +10 h +2556 m +4 h +10 h +11128 m +4 h +15503 m +25 h +123 h +4 h +4 h +4 h +15504 m +265 h +195 h +1 h +15505 m +10 h +10 h +1 h +10 h +4 h +118 h +1 h +4 h +15506 m +57 h +73 h +108 h +15507 m +1 h +1 h +1 h +3 h +1 h +57 h +10 h +4 h +11 h +15508 m +4 h +15509 m +4 h +4 h +97 h +1 h +15510 m +109 h +15511 m +27 h +10 h +1 h +1 h +4 h +1 h +4 h +4 h +65 h +59 h +15512 m +4 h +124 h +479 m +4 h +11 h +1 h +661 m +1 h +4 h +15513 m +15514 m +8 h +5407 m +4 h +190 h +12 h +10 h +1 h +91 h +74 h +110 h +4 h +4 h +79 h +4 h +15515 m +1 h +10 h 
+11 h +1 h +15516 m +185 h +368 h +4 h +4 h +15517 m +15518 m +4 h +1 h +124 h +83 h +3322 m +15519 m +157 h +1 h +299 h +15520 m +4 h +4 h +250 h +57 h +13 h +190 h +4 h +15521 m +119 h +109 h +4 h +4 h +15522 m +4 h +4 h +1 h +4 h +4 h +11 h +4 h +15523 m +1 h +10 h +10 h +10 h +4 h +83 h +4 h +10 h +10 h +15524 m +10 h +97 h +4 h +15525 m +8 h +190 h +1 h +41 h +94 h +13326 m +996 m +1 h +94 h +123 h +1 h +1 h +4 h +59 h +1 h +15526 m +1 h +3 h +911 h +10 h +36 h +447 h +4 h +15527 m +15528 m +6583 m +74 h +15529 m +15530 m +4 h +1 h +65 h +10 h +3 h +5483 m +1 h +358 h +4 h +330 h +15531 m +10 h +15532 m +15533 m +123 h +10 h +4 h +4 h +82 h +108 h +278 h +520 m +15534 m +4 h +1 h +8 h +10 h +15535 m +297 h +15536 m +97 h +25 h +770 m +15537 m +4 h +15538 m +82 h +1 h +10 h +1 h +4 h +1 h +55 h +15539 m +274 h +4 h +65 h +4 h +4 h +10 h +15540 m +1 h +10 h +10 h +15541 m +109 h +10 h +4 h +4 h +15542 m +1 h +1 h +1 h +4 h +1 h +10 h +4 h +1 h +73 h +4 h +59 h +55 h +1 h +1 h +15543 m +1 h +1 h +82 h +1 h +4 h +4 h +4 h +1 h +4 h +4 h +15544 m +4 h +10 h +10 h +10 h +15545 m +4849 m +190 h +4 h +15546 m +4 h +520 h +1 h +1 h +15547 m +4 h +1 h +83 h +4896 m +4 h +15548 m +4 h +57 h +125 h +15549 m +15550 m +1 h +15551 m +10 h +15552 m +1 h +4 h +1030 h +4 h +1737 m +74 h +109 h +1 h +15553 m +10 h +1 h +1 h +1 h +4 h +97 h +15554 m +1 h +4 h +1 h +1 h +59 h +1 h +15555 m +1 h +195 h +15556 m +10 h +195 h +11 h +114 h +1 h +82 h +3 h +1 h +15557 m +4 h +140 h +4 h +125 h +4 h +1678 m +8882 m +368 h +386 h +10 h +15558 m +15559 m +4 h +15560 m +4 h +4 h +15561 m +10 h +10 h +104 h +11 h +15562 m +10 h +4 h +4 h +2308 h +104 h +1 h +15563 m +1 h +79 h +10 h +763 m +4 h +4 h +15564 m +10 h +59 h +15565 m +173 h +4 h +15566 m +158 h +1271 m +4 h +57 h +536 h +4 h +4 h +11 h +15567 m +4 h +170 h +1 h +1 h +6158 m +4 h +1 h +1 h +4 h +1 h +196 h +104 h +82 h +266 h +15568 m +15569 m +15570 m +147 h +15571 m +11 h +10 h +1 h +15572 m +1 h +10 h +15573 m +184 h +258 h +1 
h +15574 m +15575 m +4 h +10 h +57 h +1 h +15576 m +10 h +489 m +10 h +1 h +4 h +186 h +15577 m +4 h +1 h +12 h +146 h +15578 m +871 m +757 h +4 h +15579 m +4 h +15580 m +4 h +4 h +15581 m +4 h +15582 m +1 h +10 h +15583 m +4 h +31 h +8767 m +10 h +10 h +13 h +4 h +4 h +15584 m +10 h +15585 m +124 h +82 h +4 h +4 h +4 h +1 h +15586 m +1260 m +10 h +15587 m +110 h +4 h +65 h +12489 m +56 h +10 h +1 h +1 h +57 h +15588 m +10 h +3 h +4 h +4 h +687 h +15589 m +1 h +10 h +15590 m +7839 m +4 h +65 h +1 h +3 h +125 h +10 h +74 h +15591 m +15592 m +15593 m +779 h +15594 m +4 h +10 h +74 h +10 h +808 m +4 h +1308 m +2625 m +1 h +22 h +25 h +1 h +965 h +59 h +15595 m +74 h +15596 m +4 h +10 h +10 h +1 h +11 h +4 h +143 h +1 h +15597 m +5330 m +1 h +125 h +10 h +15598 m +15599 m +1 h +4 h +11 h +1 h +1 h +3 h +125 h +10 h +15600 m +15601 m +3 h +83 h +15602 m +1 h +22 h +167 h +15603 m +687 h +4 h +1 h +55 h +15604 m +1 h +13 h +2931 m +4 h +186 h +1 h +818 m +4 h +1 h +15605 m +1 h +1 h +15606 m +15607 m +4 h +181 h +1685 h +1 h +479 m +1 h +1 h +3 h +15608 m +4 h +11 h +4 h +10 h +15377 m +295 h +4 h +11 h +4 h +353 h +15609 m +4 h +10 h +1 h +10 h +4 h +1027 h +10 h +1 h +10 h +1 h +196 h +3558 m +4 h +10 h +123 h +15610 m +5225 m +109 h +4 h +2591 m +15611 m +82 h +91 h +4 h +1 h +250 h +1478 m +13 h +6941 m +1 h +723 m +70 m +4 h +15612 m +1 h +4 h +4 h +1 h +15613 m +77 h +116 m +4 h +4 h +104 h +10 h +73 h +10 h +10 h +15614 m +10 h +4 h +616 m +6129 m +4 h +10 h +59 h +55 h +82 h +4229 m +11 h +4 h +4 h +10 h +1 h +10 h +10 h +57 h +109 h +15615 m +4 h +1 h +3 h +4 h +258 h +104 h +1220 m +4 h +15616 m +15617 m +65 h +108 h +1 h +1 h +4 h +15618 m +10 h +1 h +1 h +172 h +4 h +10 h +82 h +276 h +10 h +1 h +4 h +10 h +15619 m +1 h +4 h +15620 m +139 h +266 h +109 h +1 h +1 h +92 h +1 h +106 m +15621 m +15622 m +10 h +1780 h +2851 m +45 h +146 h +4 h +1 h +147 h +1 h +1 h +15623 m +125 h +1016 h +4 h +1 h +15624 m +4 h +4 h +64 h +97 h +22 h +1 h +64 h +15625 m +15626 m 
+10 h +1 h +4 h +15627 m +15628 m +4 h +59 h +4 h +10 h +4 h +11 h +1 h +1796 h +15629 m +1981 m +386 h +15630 m +114 h +15631 m +4 h +9482 m +4 h +15632 m +279 h +1 h +10 h +1 h +448 m +4 h +1 h +36 h +104 h +15633 m +4 h +119 h +41 h +10 h +4 h +1 h +4 h +1 h +4 h +10 h +123 h +4 h +4 h +10 h +10 h +15634 m +4 h +10 h +307 h +10 h +15635 m +1642 h +4 h +1 h +583 m +11 h +15636 m +15637 m +77 h +15638 m +4 h +4 h +55 h +15639 m +1 h +1 h +332 h +15640 m +15641 m +10 h +4 h +25 h +874 m +15642 m +4 h +10 h +15643 m +15644 m +10 h +83 h +10 h +15645 m +4 h +11 h +10 h +10 h +15646 m +4 h +3177 m +1 h +1642 h +10 h +12 h +1444 m +27 h +15647 m +1 h +4 h +4 h +119 h +4 h +10 h +10 h +4 h +4 h +15648 m +15649 m +332 h +124 h +1 h +10 h +224 m +1 h +1 h +208 m +1 h +167 h +15650 m +97 h +1 h +4 h +1 h +4 h +4 h +4 h +4 h +15651 m +9293 m +15652 m +1815 m +4 h +10 h +295 h +4 h +10 h +15653 m +15654 m +1 h +4 h +1 h +626 m +15655 m +10 h +15656 m +15657 m +15658 m +10 h +65 h +169 h +195 h +10 h +4 h +15659 m +45 h +15660 m +4 h +4 h +15661 m +15662 m +10640 m +10 h +4 h +4 h +45 h +1 h +74 h +15663 m +185 h +1822 h +73 h +4 h +10 h +4 h +124 h +1 h +15664 m +1 h +307 h +2172 m +4 h +4 h +538 h +4 h +4 h +2474 m +1 h +15665 m +10 h +1 h +109 h +5809 m +1 h +15666 m +4 h +10 h +25 h +1 h +109 h +167 h +119 h +4 h +11869 m +1 h +4 h +15667 m +15668 m +15669 m +285 m +124 h +1 h +4 h +124 h +94 h +4 h +3 h +1 h +10 h +4 h +4 h +10 h +1 h +15670 m +4 h +4 h +10 h +15671 m +4 h +4 h +4 h +897 m +1 h +10 h +11 h +31 h +10 h +1 h +15672 m +4 h +15673 m +4 h +10 h +587 m +3 h +82 h +10 h +59 h +4 h +4 h +11 h +1 h +10 h +74 h +1 h +15674 m +10 h +1 h +41 h +10 h +15675 m +4 h +15676 m +31 h +4 h +10 h +1 h +224 m +15677 m +238 h +15678 m +10 h +119 h +2438 m +125 h +15679 m +538 h +15680 m +4 h +15681 m +15682 m +22 h +13140 m +1 h +15683 m +1 h +185 h +1 h +11 h +74 h +1 h +11 h +1 h +15684 m +1 h +1 h +1 h +10 h +1 h +15685 m +1 h +1 h +4 h +109 h +82 h +4 h +1 h +10 h +4 h 
+1030 h +15686 m +2126 m +4 h +15687 m +4 h +10 h +15688 m +56 h +1 h +4 h +36 h +1 h +15689 m +4 h +15690 m +4 h +10 h +10 h +97 h +1 h +1478 m +4 h +10 h +4 h +3 h +15691 m +10 h +4 h +1 h +10 h +97 h +1 h +1 h +15692 m +36 h +4 h +4 h +1 h +27 h +10 h +10 h +10 h +15693 m +1 h +147 h +73 h +10 h +74 h +13 h +15694 m +169 h +4 h +15695 m +297 h +1 h +4 h +4 h +8734 m +1 h +11 h +59 h +10 h +1 h +10 h +1 h +4 h +10 h +4 h +15696 m +258 h +114 h +3562 m +1 h +1 h +10 h +4 h +1948 m +258 h +4 h +3 h +10 h +1 h +15697 m +83 h +10 h +386 h +4 h +10 h +140 h +15698 m +4 h +371 h +8383 m +1 h +4 h +1220 m +359 h +10 h +12372 m +11 h +1038 m +94 h +3732 m +41 h +4 h +15699 m +10 h +3 h +15700 m +10 h +2309 m +15701 m +1 h +1 h +57 h +5863 m +2965 m +25 h +1 h +15702 m +1 h +4 h +4 h +1 h +4 h +1 h +4 h +11779 m +10 h +4 h +119 h +92 h +4 h +15703 m +27 h +15704 m +4524 m +1454 h +15705 m +1 h +4 h +15706 m +15707 m +258 h +4 h +4 h +1 h +15708 m +4 h +109 h +1083 m +229 h +912 m +4 h +4 h +4 h +1955 m +15709 m +1 h +1 h +2851 m +15710 m +10 h +258 h +45 h +4359 m +10 h +4 h +4 h +4 h +15711 m +4 h +2720 m +10 h +278 h +15712 m +1 h +4 h +1 h +4 h +4 h +219 m +1 h +4 h +57 h +15713 m +109 h +4 h +15714 m +82 h +4 h +11 h +31 h +9940 m +10 h +4 h +4 h +4 h +11 h +15715 m +109 h +15716 m +1 h +15717 m +10 h +4 h +4 h +139 h +1 h +4 h +147 h +15718 m +55 h +4 h +10 h +1 h +3 h +3845 m +15719 m +4 h +28 h +15720 m +1 h +22 h +94 h +1 h +4 h +10 h +4 h +4 h +59 h +4 h +11 h +4 h +4 h +1 h +4 h +4 h +1 h +124 h +1 h +4 h +3089 m +435 h +278 h +15721 m +4 h +4 h +1 h +15722 m +15723 m +11 h +976 h +15724 m +1 h +15725 m +3748 m +10 h +11 h +57 h +10 h +1 h +15726 m +1 h +1137 h +1 h +11 h +265 h +15727 m +1722 m +1 h +10 h +172 h +10 h +3 h +10 h +241 m +11 h +4 h +167 h +1 h +123 h +7215 m +4 h +238 h +8 h +15728 m +4 h +4 h +1 h +3 h +4 h +4 h +238 h +1 h +10 h +15729 m +97 h +195 h +83 h +10 h +4 h +4 h +143 h +15730 m +65 h +4 h +278 h +4 h +10 h +10 h +4 h +4 h +3 h +5933 m 
+4 h +57 h +5 h +4 h +4 h +15731 m +1 h +1 h +238 h +278 h +15732 m +4 h +4 h +11 h +4 h +1 h +82 h +10 h +10 h +158 h +4 h +10 h +4 h +15733 m +25 h +4 h +156 h +15734 m +15735 m +59 h +15736 m +1 h +1492 m +15737 m +1 h +10 h +4 h +4 h +10 h +1 h +1 h +4 h +11 h +4 h +4 h +4 h +15738 m +447 h +15739 m +4 h +114 h +4 h +25 h +15740 m +2494 m +6001 m +1 h +31 h +15741 m +15742 m +4 h +1 h +15743 m +2022 m +4 h +192 h +15744 m +10 h +15745 m +119 h +4 h +4 h +4 h +4 h +1 h +1 h +4 h +4 h +55 h +1 h +55 h +2002 m +15746 m +1 h +4 h +1714 h +4 h +10 h +195 h +1 h +4 h +1 h +1 h +1914 h +4 h +10 h +25 h +57 h +4 h +4 h +36 h +4 h +15747 m +4 h +15748 m +640 h +2183 m +82 h +64 h +4 h +1 h +10 h +11 h +279 h +536 h +10 h +371 h +1 h +11 h +64 h +538 h +3702 m +1 h +1 h +10 h +10 h +1 h +15749 m +270 h +79 h +10 h +10 h +1 h +10 h +4 h +164 h +64 h +64 h +124 h +14050 m +1 h +1 h +520 h +1 h +172 h +10 h +4 h +4 h +15750 m +1780 h +4 h +10 h +15751 m +2116 m +10 h +4 h +10 h +15752 m +15753 m +15754 m +687 h +1 h +15755 m +358 h +15756 m +4714 m +10 h +1 h +4 h +4 h +1 h +1 h +15757 m +10 h +1 h +12 h +5296 m +12005 m +15758 m +83 h +4 h +15759 m +4 h +11 h +4542 m +156 h +757 h +4 h +1 h +10958 m +15760 m +4 h +10 h +4 h +13392 m +10 h +167 h +15761 m +656 m +15762 m +4 h +1948 m +1 h +10 h +4 h +45 h +36 h +172 h +1 h +15763 m +4 h +1 h +4 h +15764 m +57 h +4 h +4 h +55 h +146 h +10 h +36 h +258 h +4 h +109 h +2459 m +15765 m +4 h +15766 m +4 h +1 h +15767 m +15768 m +1 h +4 h +1 h +82 h +4 h +10 h +266 h +4 h +4 h +15769 m +10 h +10 h +15770 m +4 h +15771 m +10 h +31 h +10 h +4 h +135 h +10 h +31 h +1 h +1 h +4 h +4 h +10 h +1 h +1 h +383 h +10 h +1 h +1062 m +4 h +1 h +1677 m +15772 m +15773 m +4 h +146 h +125 h +4 h +10 h +195 h +4 h +4 h +15774 m +15775 m +109 h +1 h +15776 m +15777 m +73 h +156 h +4 h +10 h +25 h +15778 m +3177 m +11 h +10 h +10 h +1 h +10 h +4 h +15779 m +1198 m +1 h +10 h +1 h +172 h +1 h +146 h +15780 m +27 h +4 h +1201 m +1 h +11 h +4 h +15781 
m +939 m +125 h +112 h +10 h +25 h +15782 m +4 h +1 h +170 h +10 h +4 h +10 h +4 h +3 h +181 h +10 h +4 h +4308 m +278 h +1 h +15783 m +355 m +109 h +15784 m +59 h +1 h +1 h +10 h +15785 m +1 h +278 h +11 h +1 h +1003 m +4 h +10 h +110 h +4 h +15786 m +4 h +4 h +83 h +10 h +4 h +10 h +1 h +15787 m +1 h +10 h +4 h +591 m +15788 m +138 m +15789 m +4 h +11 h +15790 m +79 h +15791 m +15792 m +6124 m +1 h +4 h +59 h +10 h +4 h +3 h +13 h +10 h +57 h +15793 m +15794 m +1137 h +109 h +15795 m +4 h +1 h +3143 m +1 h +10 h +135 h +82 h +224 h +15796 m +10 h +190 h +65 h +11 h +15797 m +15798 m +10 h +15799 m +10 h +15800 m +1 h +169 h +10 h +41 h +170 h +221 m +15801 m +338 h +10 h +15802 m +1 h +15803 m +56 h +1 h +250 h +4 h +82 h +4 h +97 h +4 h +10 h +11 h +11 h +4 h +15804 m +10 h +1 h +82 h +83 h +1 h +15805 m +4 h +10 h +15806 m +4 h +10 h +4 h +10 h +15807 m +4 h +1 h +10 h +1 h +15808 m +4 h +124 h +4 h +4 h +1 h +4 h +4 h +966 h +1 h +10 h +4 h +11 h +1 h +1 h +31 h +12 h +2148 m +79 h +4 h +59 h +4 h +143 h +1 h +1 h +15809 m +10 h +10 h +15810 m +59 h +10 h +15811 m +1 h +1 h +15812 m +25 h +15813 m +124 h +4 h +1 h +1 h +4 h +3847 m +10 h +10 h +15814 m +3 h +4 h +59 h +104 h +1 h +11 h +15815 m +1 h +347 m +4 h +4 h +1 h +27 h +15816 m +169 h +156 h +10 h +4 h +83 h +1 h +1 h +36 h +10 h +1 h +10 h +124 h +1027 h +15817 m +15818 m +143 h +4 h +2887 m +1 h +1 h +15819 m +1 h +15820 m +1 h +124 h +114 h +1 h +297 h +4 h +6726 m +4 h +3398 m +307 h +15821 m +12 h +1 h +10 h +1 h +4 h +15822 m +4 h +138 m +2096 m +15823 m +4 h +1 h +4 h +10 h +4 h +359 h +3539 m +15824 m +2002 m +4 h +1 h +4 h +59 h +123 h +10 h +1 h +10 h +10 h +7135 m +4 h +15825 m +4 h +1 h +10 h +1 h +10 h +11 h +1 h +15826 m +1 h +1 h +15827 m +13 h +2694 m +6422 m +113 h +15828 m +464 h +1 h +15829 m +83 h +1 h +1 h +4 h +11 h +10 h +15830 m +1122 m +59 h +1 h +13468 m +976 h +15831 m +1 h +15832 m +10 h +1 h +10 h +1 h +4 h +12 h +4 h +15833 m +10 h +4 h +4 h +36 h +65 h +169 h +195 h +1 h 
+1137 h +11 h +1 h +1359 h +77 h +677 m +15834 m +4 h +15835 m +15836 m +4 h +238 h +2308 h +1 h +27 h +10 h +15837 m +3 h +1 h +3 h +10 h +1 h +4 h +4 h +4 h +15838 m +14345 m +15839 m +109 h +1 h +11 h +4 h +15840 m +12301 m +169 h +1 h +15 m +1117 m +15841 m +15842 m +15843 m +10 h +10 h +15844 m +1 h +397 m +10089 m +4 h +434 m +110 h +13886 m +4 h +4 h +241 m +10 h +1 h +1 h +106 m +10 h +9536 m +1822 h +1 h +4 h +684 m +10 h +5 h +1650 h +1 h +15845 m +15846 m +1 h +15847 m +109 h +15848 m +1 h +109 h +3 h +15849 m +767 m +4 h +1 h +82 h +4 h +1 h +338 h +1 h +10 h +114 h +4 h +25 h +113 h +15850 m +228 m +31 h +4 h +2920 m +386 h +10 h +15851 m +4 h +57 h +15852 m +11 h +1137 h +57 h +73 h +64 h +1 h +4 h +1981 m +15853 m +4 h +4 h +15854 m +15855 m +10 h +15856 m +92 h +10 h +109 h +4 h +2379 h +569 h +10 h +11 h +258 h +41 h +15857 m +4576 m +114 h +1 h +10 h +4 h +10 h +146 h +4 h +27 h +1 h +10 h +10 h +15858 m +4 h +55 h +3 h +4 h +10 h +83 h +10 h +1 h +267 m +55 h +4 h +10 h +4 h +536 h +73 h +12 h +82 h +15859 m +140 h +1 h +4 h +1 h +48 h +4 h +124 h +10 h +4 h +4 h +1 h +1 h +113 h +25 h +15860 m +97 h +15861 m +1 h +1 h +15862 m +4 h +10 h +10 h +36 h +10 h +1 h +74 h +1 h +386 h +1 h +15863 m +4 h +1 h +3025 m +10 h +1 h +1953 h +1 h +31 h +196 h +15864 m +718 h +8809 m +10 h +5863 m +45 h +31 h +4 h +5689 m +1 h +15865 m +10 h +25 h +10 h +15866 m +83 h +15867 m +4 h +31 h +4 h +11 h +10 h +4 h +15868 m +106 m +4 h +97 h +4 h +1 h +718 h +15869 m +386 h +15870 m +15871 m +10 h +4 h +10 h +1 h +57 h +1 h +110 h +1 h +10 h +4 h +447 h +4 h +119 h +15872 m +11 h +15873 m +4 h +4 h +4 h +1 h +367 h +15874 m +15875 m +185 h +1 h +1 h +10 h +15876 m +15877 m +31 h +45 h +4 h +4 h +15878 m +4 h +11 h +110 h +4 h +1 h +4 h +15879 m +36 h +270 h +4 h +25 h +4 h +15880 m +1 h +11 h +15881 m +15882 m +4830 m +114 h +279 h +3707 m +1 h +15883 m +10 h +1650 h +4 h +92 h +82 h +4 h +4 h +10 h +92 h +10 h +4 h +1 h +1 h +15884 m +332 h +4905 h +1 h +15885 m +1 
h +1 h +10 h +10 h +15886 m +15887 m +15888 m +119 h +10 h +1 h +4 h +4 h +10 h +56 h +10 h +1 h +15889 m +1 h +56 h +15890 m +1006 m +4 h +10 h +4 h +538 h +4 h +109 h +1 h +1 h +1 h +4 h +15891 m +104 h +10 h +1 h +10 h +15892 m +10 h +73 h +4 h +1 h +15893 m +57 h +1 h +15894 m +10 h +15895 m +15896 m +1 h +954 m +238 h +1 h +135 h +1 h +1666 m +1 h +15897 m +443 h +4 h +4 h +15898 m +82 h +10 h +1 h +1 h +4 h +12363 m +1714 h +15899 m +1 h +4 h +15900 m +15901 m +509 m +10464 m +15902 m +15903 m +15904 m +4 h +146 h +4 h +276 h +15905 m +15906 m +10 h +1 h +1 h +10 h +11 h +164 h +4 h +15907 m +536 h +4 h +15908 m +15909 m +4 h +10 h +4 h +12 h +1 h +1 h +25 h +4 h +4 h +4 h +10 h +4 h +27 h +73 h +36 h +4 h +4 h +1 h +3704 m +15910 m +4 h +15911 m +3 h +12 h +11 h +368 h +4 h +15912 m +11 h +15913 m +1 h +2265 m +1 h +11 h +94 h +10 h +15914 m +3 h +15915 m +124 h +1 h +1 h +31 h +4 h +1 h +4 h +3 h +138 h +4 h +15916 m +4 h +65 h +1 h +4 h +2128 m +103 h +4 h +73 h +10 h +15917 m +4 h +447 h +1 h +4 h +4 h +10 h +4 h +13481 m +1 h +83 h +25 h +11 h +1 h +4 h +1 h +1016 h +1 h +15918 m +4 h +10 h +83 h +1 h +11 h +1374 m +1 h +56 h +74 h +114 h +4 h +4 h +4 h +15919 m +2754 m +10 h +114 h +15920 m +15921 m +15922 m +15923 m +185 h +4 h +172 h +7532 m +1027 h +1822 h +15924 m +1 h +1 h +10 h +15925 m +10 h +82 h +15926 m +4 h +195 h +4 h +1 h +1 h +4 h +1 h +464 h +1 h +4 h +1 h +4 h +3 h +10 h +1 h +1 h +15927 m +4 h +15928 m +7727 m +1 h +4 h +15929 m +4 h +15930 m +11 h +10 h +10 h +10 h +4 h +10 h +1 h +1 h +10 h +15931 m +15932 m +64 h +15933 m +1 h +4 h +92 h +307 h +10 h +4 h +10391 m +4 h +4 h +25 h +25 h +4 h +4 h +109 h +307 h +4 h +79 h +4 h +1 h +1 h +4 h +4 h +1 h +82 h +59 h +1 h +582 m +10 h +616 m +1 h +15934 m +4 h +1359 h +11 h +1321 m +73 h +110 h +11 h +15935 m +15936 m +4 h +1619 h +15937 m +1 h +22 h +4 h +15938 m +15939 m +11 h +1 h +10 h +1 h +13 h +92 h +3 h +3 h +10685 m +1 h +15940 m +10 h +10324 m +4 h +124 h +181 h +4 h +15941 m 
+843 m +1 h +15942 m +10 h +13662 m +4 h +15943 m +3 h +15944 m +266 h +15945 m +15946 m +15947 m +1 h +15948 m +146 h +10 h +4 h +15949 m +15950 m +10 h +10 h +10 h +1 h +112 h +1713 m +4 h +464 h +10 h +332 h +15951 m +359 h +15952 m +10 h +10 h +10 h +4 h +15953 m +147 h +15954 m +4 h +486 m +10 h +238 h +59 h +74 h +4 h +4 h +3068 m +10 h +57 h +15955 m +4229 m +10 h +4 h +10 h +10 h +4 h +10 h +14050 m +11353 m +10 h +1 h +1074 h +1 h +1 h +10 h +15956 m +4 h +15957 m +640 h +10 h +1 h +15958 m +1 h +1 h +15959 m +4 h +27 h +4 h +1 h +124 h +477 m +4 h +1083 m +1 h +4 h +15960 m +4 h +15961 m +1 h +109 h +15962 m +4 h +4 h +82 h +10 h +109 h +124 h +15963 m +15964 m +15965 m +41 h +11 h +125 h +888 m +4 h +4 h +1053 m +4 h +15966 m +4 h +31 h +15967 m +4 h +10 h +1 h +15968 m +736 m +15969 m +31 h +1 h +10 h +11 h +4 h +4 h +15970 m +4 h +15971 m +1 h +10 h +4 h +4 h +1340 m +3 h +190 h +583 m +4 h +15972 m +10 h +1 h +15973 m +4 h +83 h +15974 m +135 h +13 h +13544 m +15975 m +4 h +15976 m +1 h +4 h +15977 m +4 h +4 h +4 h +4 h +114 h +15978 m +82 h +1 h +15979 m +12526 m +15980 m +4030 m +1 h +2733 h +41 h +860 m +114 h +1250 h +15981 m +15982 m +4 h +15983 m +1089 h +4 h +110 h +15984 m +13 h +399 h +15985 m +15986 m +196 h +10 h +4 h +147 h +10 h +888 m +11 h +15987 m +5141 m +109 h +1 h +229 h +97 h +4 h +677 m +464 h +359 h +4 h +15988 m +4 h +15989 m +104 h +1 h +10 h +12 h +4 h +10 h +820 m +15990 m +5357 m +4 h +15263 m +12 h +3 h +1 h +1 h +94 h +4 h +1 h +11344 m +4 h +1 h +1780 h +15991 m +10 h +1 h +74 h +15992 m +10 h +4 h +4 h +238 h +1 h +4 h +1 h +4 h +82 h +15993 m +1 h +10 h +4 h +15994 m +4 h +12 h +4 h +25 h +10 h +12635 m +4 h +4 h +4 h +147 h +4 h +270 h +267 m +1 h +412 h +10 h +4 h +1 h +279 h +15995 m +65 h +12 h +15996 m +1 h +1 h +1 h +10 h +1 h +4 h +15997 m +25 h +8040 m +1 h +1 h +1 h +1 h +7702 m +10 h +808 m +4 h +1 h +4 h +15998 m +15999 m +10 h +92 h +12 h +195 h +11 h +16000 m +16001 m +92 h +718 h +1 h +16002 m +55 h +10 h 
+123 h +10 h +4 h +4 h +10 h +13 h +3112 m +16003 m +3 h +562 m +663 m +172 h +10 h +31 h +10 h +10 h +1 h +1 h +4 h +135 h +1 h +16004 m +16005 m +4 h +16006 m +16007 m +10 h +31 h +16008 m +1 h +16009 m +1 h +16010 m +10 h +1083 m +4 h +1 h +4 h +16011 m +4 h +4 h +264 h +10 h +16012 m +16013 m +4 h +4 h +2733 h +57 h +1 h +4 h +16014 m +16015 m +250 h +185 h +4 h +4 h +16016 m +185 h +1 h +1 h +1 h +339 m +10 h +307 h +10 h +10 h +55 h +4 h +12264 m +4 h +3 h +4 h +4 h +4 h +11 h +264 h +16017 m +36 h +5720 m +687 h +167 h +10 h +109 h +4 h +4 h +1 h +4 h +1 h +276 h +146 h +4 h +4 h +1 h +9182 m +10 h +11 h +10 h +5254 m +1 h +4 h +4 h +1 h +73 h +74 h +1 h +1 h +10 h +16018 m +10 h +65 h +258 h +4 h +4 h +10 h +61 m +1 h +1 h +10 h +110 h +16019 m +16020 m +4 h +4 h +10 h +4 h +10 h +27 h +536 h +10 h +16021 m +16022 m +16023 m +40 m +1 h +281 m +4 h +82 h +11621 m +1535 m +16024 m +82 h +16025 m +11 h +229 h +82 h +195 h +143 h +203 m +83 h +4 h +16026 m +11 h +11 h +1 h +16027 m +16028 m +1261 h +59 h +16029 m +16030 m +4 h +10 h +12 h +143 h +4 h +169 h +10 h +1 h +12 h +1 h +104 h +10 h +10 h +10 h +4 h +4 h +16031 m +1 h +59 h +57 h +4 h +16032 m +16033 m +195 h +224 h +41 h +443 h +16034 m +843 m +195 h +297 h +74 h +31 h +1 h +16035 m +10 h +4 h +10 h +471 m +10 h +1 h +10 h +181 h +74 h +1 h +10 h +4 h +12225 m +1 h +4 h +4 h +169 h +2148 m +11810 m +1 h +4 h +11 h +16036 m +4 h +16037 m +1 h +425 m +4 h +11 h +16038 m +16039 m +285 m +16040 m +278 h +16041 m +10 h +1 h +83 h +10 h +16042 m +4 h +16043 m +1 h +104 h +4 h +1 h +3 h +506 m +4 h +10 h +1 h +10 h +4 h +307 h +1 h +4 h +1 h +73 h +16044 m +104 h +169 h +22 h +1 h +1 h +1 h +1 h +79 h +16045 m +10 h +4 h +16046 m +10 h +59 h +16047 m +11 h +16048 m +3 h +4 h +10 h +16049 m +83 h +48 h +16050 m +4 h +10 h +1 h +4 h +1 h +1 h +57 h +82 h +1 h +59 h +16051 m +4 h +1 h +65 h +104 h +104 h +939 m +1 h +1 h +16052 m +1 h +16053 m +1 h +228 m +1 h +8070 m +412 h +16054 m +16055 m +10 h +55 h +1 h 
+16056 m +10 h +1 h +104 h +16057 m +1 h +443 h +3 h +1 h +4 h +16058 m +16059 m +92 h +16060 m +16061 m +74 h +4 h +4 h +238 h +11 h +2040 m +1 h +82 h +140 h +82 h +73 h +4 h +55 h +1 h +3170 m +10 h +4 h +10 h +578 h +4 h +16062 m +4 h +2308 h +986 h +1 h +4 h +1 h +10 h +1 h +4 h +12372 m +16063 m +4 h +10 h +4 h +16064 m +10 h +196 h +4 h +16065 m +1 h +4 h +4 h +1 h +1642 h +16066 m +4 h +16067 m +208 m +338 h +110 h +97 h +9757 m +16068 m +10 h +10 h +10 h +10 h +4 h +16069 m +16070 m +16071 m +16072 m +1 h +297 h +2851 h +114 h +4 h +16073 m +195 h +97 h +1 h +10 h +109 h +443 h +16074 m +16075 m +1 h +1 h +1 h +83 h +642 h +4 h +10 h +1 h +124 h +4 h +33 m +1 h +16076 m +4 h +4 h +692 h +1 h +59 h +2054 m +10 h +4 h +104 h +13980 m +10 h +806 m +4 h +4 h +16077 m +10 h +4 h +10 h +16078 m +16079 m +16080 m +147 h +4592 m +65 h +16081 m +11 h +16082 m +12 h +1 h +4 h +10 h +1 h +601 h +10 h +73 h +16083 m +1955 m +16084 m +5809 m +10 h +11 h +41 h +4 h +16085 m +1 h +4 h +57 h +2751 m +36 h +16086 m +3435 m +4 h +4 h +181 h +1 h +119 h +4 h +16087 m +4 h +4 h +104 h +143 h +157 h +5387 m +185 h +74 h +4 h +45 h +4 h +10 h +16088 m +464 h +1 h +2625 m +10 h +16089 m +1 h +16090 m +2840 m +10 h +10 h +41 h +4 h +1 h +4 h +1 h +4 h +4 h +16091 m +1 h +1 h +92 h +10 h +10 h +1 h +10 h +16092 m +4 h +4911 m +4 h +358 h +124 h +10 h +4 h +1 h +16093 m +16094 m +146 h +11 h +4 h +106 h +5 h +1 h +143 h +4 h +59 h +174 m +16095 m +1359 h +966 h +16096 m +4 h +156 h +139 h +146 h +4 h +1 h +4 h +4 h +10 h +10 h +3 h +124 h +4 h +82 h +184 h +16097 m +10 h +16098 m +804 m +4 h +1 h +6869 m +4 h +16099 m +59 h +4 h +1 h +2607 m +4 h +16100 m +11 h +4 h +16101 m +1 h +4 h +4 h +25 h +10 h +16102 m +82 h +4 h +16103 m +1 h +1 h +4 h +4 h +109 h +1 h +16104 m +10 h +16105 m +4 h +4 h +1 h +16106 m +28 h +16107 m +2025 m +1 h +10 h +1 h +229 h +13426 m +77 h +1 h +278 h +16108 m +172 h +4 h +10 h +307 h +125 h +4 h +16109 m +11 h +1 h +16110 m +4 h +4 h +4 h +16111 m +4 h 
+3 h +4 h +25 h +12 h +41 h +297 h +124 h +4 h +10 h +16112 m +16113 m +16114 m +4 h +1 h +16115 m +4 h +57 h +83 h +4 h +1 h +10 h +4 h +74 h +4 h +1 h +41 h +10 h +4 h +16116 m +1 h +1 h +1 h +10 h +10 h +4 h +297 h +459 h +196 h +601 h +4 h +157 h +16117 m +83 h +16118 m +1 h +97 h +4 h +16119 m +10177 m +4 h +181 h +11 h +16120 m +16121 m +4 h +11 h +4 h +10 h +4 h +59 h +41 h +4 h +16122 m +146 h +4 h +16123 m +10 h +4 h +16124 m +7401 m +172 h +109 h +1 h +16125 m +181 h +687 h +4 h +1 h +1 h +16126 m +82 h +4 h +258 h +4 h +4 h +16127 m +10 h +16128 m +147 h +4 h +164 h +1445 m +4 h +1 h +11948 m +10 h +4 h +272 m +124 h +1 h +57 h +59 h +1 h +3 h +403 h +16129 m +83 h +7839 m +4 h +16130 m +4 h +181 h +186 h +150 m +172 h +65 h +10 h +59 h +230 m +4 h +119 h +4 h +59 h +10 h +10 h +4 h +16131 m +16132 m +10 h +4 h +16133 m +1 h +27 h +10958 m +124 h +16134 m +16135 m +16136 m +10 h +82 h +4 h +1 h +1 h +737 m +1 h +3307 m +185 h +97 h +16137 m +57 h +16138 m +4 h +16139 m +10 h +41 h +4 h +124 h +1 h +386 h +203 m +601 h +3988 m +845 m +10 h +4 h +59 h +16140 m +16141 m +4 h +146 h +167 h +124 h +119 h +1 h +10 h +1470 h +4 h +4 h +4 h +156 h +10 h +16142 m +31 h +1 h +4 h +10 h +195 h +16143 m +82 h +966 h +4 h +16144 m +4 h +16145 m +4 h +536 h +1 h +10 h +10 h +16146 m +82 h +1 h +10 h +5348 m +1 h +4 h +693 m +11 h +16147 m +147 h +11 h +1 h +16148 m +10 h +4 h +229 h +10 h +4 h +10 h +147 h +16149 m +4 h +92 h +16150 m +10539 m +10 h +10 h +4 h +4 h +28 h +1 h +10 h +16151 m +10 h +4 h +1 h +1 h +10 h +1478 h +1 h +1 h +16152 m +4 h +4 h +10 h +4 h +55 h +16153 m +10 h +4 h +4 h +4 h +10 h +57 h +16154 m +857 h +4 h +383 h +27 h +16155 m +4 h +10 h +146 h +16156 m +1 h +16157 m +1 h +10 h +10 h +10 h +4 h +4 h +16158 m +124 h +82 h +125 h +97 h +4 h +1 h +4 h +11 h +57 h +16159 m +10 h +332 h +147 h +4 h +3048 m +10 h +185 h +4 h +57 h +10 h +4 h +538 h +1 h +704 h +4 h +276 h +184 h +10 h +4 h +4 h +57 h +4 h +351 m +55 h +4 h +1 h +16160 m +16161 m 
+16162 m +1 h +4 h +4 h +4 h +11 h +10 h +173 h +4 h +295 h +16163 m +4 h +4 h +16164 m +55 h +4 h +10 h +16165 m +16166 m +11940 m +1 h +11 h +4 h +4 h +12 h +3 h +4 h +1 h +16167 m +10 h +7444 m +97 h +109 h +1 h +10 h +146 h +4 h +4 h +10 h +4 h +1 h +16168 m +266 h +59 h +4 h +16169 m +386 h +1 h +4 h +16170 m +10 h +4 h +1 h +1 h +167 h +4 h +10 h +64 h +10 h +4 h +10414 m +1 h +10 h +73 h +4 h +1 h +190 h +4 h +185 h +156 h +16171 m +4 h +4 h +147 h +16172 m +125 h +4 h +4 h +4 h +143 h +109 h +16173 m +108 h +147 h +13157 m +124 h +1 h +1016 h +4 h +16174 m +3 h +4 h +358 h +16175 m +10 h +4 h +57 h +4 h +4 h +359 h +2923 h +16176 m +73 h +16177 m +1 h +10 h +16178 m +196 h +16179 m +1 h +16180 m +4 h +4 h +11 h +1 h +10 h +276 h +82 h +1 h +59 h +10 h +97 h +3533 m +10 h +10 h +1 h +16181 m +83 h +4 h +16182 m +4 h +4 h +16183 m +16184 m +16185 m +4 h +1201 m +3558 m +10 h +4 h +4 h +16186 m +16187 m +1 h +27 h +118 h +10 h +16188 m +59 h +10 h +16189 m +10 h +536 h +266 h +10 h +16190 m +1 h +16191 m +16192 m +10 h +1 h +4 h +16193 m +295 h +6469 m +16194 m +16195 m +10 h +1 h +109 h +274 h +16196 m +4 h +10 h +10 h +16197 m +4 h +1 h +16198 m +16199 m +4 h +16200 m +16201 m +1 h +4 h +4 h +1 h +4 h +10 h +278 h +4 h +97 h +16202 m +169 h +164 h +4 h +1 h +3 h +1 h +36 h +10 h +10 h +16203 m +11 h +4 h +1 h +82 h +4 h +4 h +4 h +1 h +83 h +4 h +10 h +16204 m +4 h +41 h +16205 m +4 h +1 h +2887 m +954 m +4 h +146 h +4 h +1 h +266 h +10 h +16206 m +4 h +10 h +1 h +1250 h +1470 h +4 h +10 h +109 h +1 h +4 h +1 h +1 h +1 h +16207 m +109 h +11 h +10 h +358 h +1 h +4 h +4 h +3 h +4 h +64 h +1 h +59 h +16208 m +10 h +124 h +1 h +1 h +4 h +1 h +10418 m +4 h +109 h +1 h +16209 m +1 h +10 h +140 h +74 h +4 h +1 h +4 h +8324 m +4 h +11 h +83 h +10 h +10 h +1 h +7541 m +9027 m +10 h +10 h +1 h +16210 m +4 h +1261 h +1 h +4 h +10 h +57 h +1 h +16211 m +16212 m +4 h +1 h +4 h +4 h +4 h +1 h +4 h +1 h +4 h +27 h +1 h +10 h +1 h +10 h +4 h +1 h +10 h +16213 m +10 h +16214 
m +4 h +16215 m +1 h +79 h +16216 m +4 h +16217 m +11 h +4 h +10 h +1 h +10 h +1 h +16218 m +10 h +4 h +65 h +1 h +4 h +113 h +10 h +4 h +10 h +59 h +114 h +4 h +25 h +16219 m +4 h +124 h +1 h +109 h +12 h +2788 h +1 h +4 h +1 h +322 h +10 h +16220 m +4 h +10 h +4 h +10 h +1 h +4 h +4 h +4 h +12 h +1 h +4 h +16221 m +110 h +279 h +4 h +10 h +4 h +1 h +41 h +6066 m +125 h +16222 m +16223 m +4 h +955 m +16224 m +493 m +16225 m +4 h +10 h +11 h +4 h +4 h +16226 m +4 h +1 h +1309 h +4 h +918 m +4 h +4 h +1 h +1 h +4535 m +1 h +4 h +1 h +57 h +10 h +124 h +1 h +147 h +4 h +10 h +112 h +10 h +11 h +4 h +16227 m +1 h +4 h +10 h +16228 m +4 h +1 h +147 h +1 h +10 h +16229 m +104 h +9450 m +83 h +1 h +676 m +1 h +4 h +104 h +4 h +4 h +1 h +10 h +4 h +1 h +16230 m +16231 m +10 h +16232 m +16233 m +4 h +16234 m +1535 m +11 h +74 h +11 h +57 h +4 h +0 m +124 h +11 h +10 h +10 h +1 h +10 h +4 h +4 h +11 h +16235 m +10 h +4 h +4 h +59 h +16236 m +4 h +114 h +4 h +4 h +16237 m +16238 m +4 h +1959 m +5567 m +403 h +10 h +1030 h +146 h +16239 m +1 h +59 h +57 h +146 h +1 h +16240 m +74 h +1 h +4 h +1 h +857 h +10 h +10 h +48 h +4 h +83 h +190 h +1 h +10 h +16241 m +1 h +4 h +4 h +1 h +2788 h +1569 m +10 h +10 h +4 h +4 h +4 h +123 h +4 h +10 h +10 h +174 m +1 h +279 h +135 h +1 h +16242 m +10 h +4 h +4 h +16243 m +371 h +1 h +4 h +1 h +1 h +1 h +1 h +11 h +1 h +16244 m +94 h +11 h +1 h +1 h +74 h +267 h +1 h +10 h +464 h +16245 m +94 h +10 h +55 h +4 h +10 h +4 h +16246 m +57 h +4 h +16247 m +16248 m +92 h +4 h +1 h +1 h +147 h +14345 m +25 h +10 h +1 h +3 h +10 h +16249 m +196 h +10 h +4 h +79 h +56 h +1 h +16250 m +11 h +1128 m +4 h +4 h +1 h +195 h +41 h +10 h +4 h +1 h +692 h +4 h +11 h +10 h +1 h +59 h +4 h +4 h +238 h +16251 m +10 h +4 h +1 h +4256 m +4 h +65 h +1 h +16252 m +92 h +1 h +41 h +4 h +1725 m +65 h +1 h +16253 m +1 h +82 h +238 h +4 h +3558 m +1 h +11 h +10 h +1 h +1 h +4 h +2435 m +16254 m +1 h +779 h +16255 m +11 h +11691 m +322 h +4 h +104 h +16256 m +4 h +124 
h +4 h +1 h +4 h +45 h +135 h +1 h +104 h +4 h +125 h +16257 m +10 h +10 h +4 h +11 h +16258 m +10 h +238 h +4 h +371 h +718 h +843 h +10 h +16259 m +4 h +82 h +1 h +4 h +4 h +276 h +10 h +1 h +10 h +1713 m +79 h +1 h +1 h +1 h +59 h +2002 h +1 h +1 h +1 h +31 h +57 h +11 h +109 h +3 h +45 h +10 h +16260 m +57 h +332 h +1 h +3 h +57 h +16261 m +2022 m +16262 m +1 h +1 h +8114 m +1 h +10 h +16263 m +16264 m +4 h +16265 m +4 h +10 h +4 h +1 h +224 h +16266 m +1 h +4 h +25 h +16267 m +1 h +270 h +1 h +4 h +4 h +4 h +692 h +12 h +4 h +55 h +140 h +1 h +11 h +1 h +56 h +16268 m +4 h +59 h +57 h +13043 m +16269 m +6784 m +9323 m +297 h +250 h +2928 m +170 h +10 h +1 h +1 h +10 h +1 h +10 h +57 h +16270 m +692 h +4 h +2490 m +4 h +31 h +4 h +181 h +1 h +10 h +11 h +82 h +16271 m +112 h +4 h +16272 m +279 h +1 h +1 h +4 h +4 h +41 h +6296 m +4 h +1 h +59 h +4714 m +538 h +27 h +97 h +4 h +4 h +16273 m +10 h +11 h +45 h +1 h +13 h +4 h +16274 m +1 h +276 h +4 h +16275 m +114 h +1 h +5065 m +195 h +1 h +368 h +12 h +5584 m +4 h +10 h +1 h +10 h +124 h +65 h +16276 m +4 h +2433 m +295 h +4 h +7900 m +1 h +4 h +22 h +203 h +114 h +1 h +4 h +698 m +16277 m +143 h +2865 m +48 h +83 h +16278 m +3 h +3 h +4 h +10 h +1 h +4 h +4 h +4 h +10 h +4 h +192 h +10 h +1 h +4 h +4 h +124 h +16279 m +4 h +16280 m +4 h +135 h +16281 m +16282 m +443 h +16283 m +16284 m +1 h +10 h +16285 m +4 h +1 h +13 h +10 h +16286 m +4 h +6107 m +12041 m +16287 m +1 h +4 h +11 h +1 h +10 h +1 h +16288 m +1 h +1 h +278 h +16289 m +1 h +4 h +195 h +1 h +1 h +147 h +488 m +299 h +4 h +16290 m +16291 m +3344 m +16292 m +464 h +4 h +1 h +16293 m +16294 m +169 h +55 h +4 h +1 h +464 h +10 h +4 h +16295 m +124 h +1 h +16296 m +4 h +1 h +11 h +4 h +11 h +4 h +1780 h +1685 h +4 h +41 h +143 h +186 h +97 h +190 h +4 h +10 h +4 h +1 h +4 h +459 h +3 h +1 h +11 h +16297 m +16298 m +4 h +109 h +1 h +11 h +16299 m +11 h +10 h +1 h +13 h +4 h +1 h +4 h +10 h +4 h +10 h +4 h +1 h +83 h +10 h +4 h +16300 m +31 h +2924 m 
+16301 m +16302 m +4 h +10 h +4111 m +94 h +4 h +16303 m +1261 h +10 h +1980 m +10 h +4 h +203 h +16304 m +1 h +65 h +16305 m +146 h +25 h +16306 m +112 h +16307 m +16308 m +10 h +36 h +16309 m +10 h +238 h +258 h +16310 m +16311 m +4 h +347 m +578 h +1 h +4 h +16312 m +10 h +1 h +16313 m +74 h +82 h +4 h +16314 m +3 h +170 h +5125 m +1 h +1 h +11 h +1123 m +172 h +109 h +16315 m +1 h +16316 m +297 h +1 h +16317 m +4 h +119 h +1 h +4 h +4 h +10 h +1 h +10 h +1 h +16318 m +17 m +4 h +16319 m +4 h +1 h +196 h +4 h +368 h +16320 m +196 h +10 h +4 h +2494 m +10 h +10 h +10 h +1 h +167 h +10 h +185 h +10 h +36 h +4 h +16321 m +1 h +10 h +1 h +110 h +4 h +16322 m +4 h +1 h +12 h +4 h +1 h +1409 m +1 h +16323 m +10 h +57 h +10 h +4 h +4 h +10 h +16324 m +139 h +1 h +4 h +4 h +1 h +25 h +92 h +83 h +4 h +147 h +3 h +16325 m +16326 m +1 h +4 h +1 h +10 h +16327 m +114 h +1 h +4 h +10 h +4 h +10 h +10 h +4 h +5008 m +25 h +1 h +10 h +16328 m +167 h +4 h +1016 h +1 h +104 h +10 h +3 h +124 h +4 h +1 h +4 h +1 h +4 h +563 m +4 h +10 h +10 h +16329 m +41 h +25 h +113 h +16330 m +124 h +10 h +4 h +238 h +1646 m +4 h +4 h +1 h +4 h +4 h +1 h +3 h +1967 m +16331 m +16332 m +10 h +10 h +1 h +10 h +1 h +124 h +1 h +172 h +16333 m +25 h +4 h +4 h +16334 m +16335 m +1 h +1 h +196 h +4 h +4 h +1 h +16336 m +10 h +4 h +1027 h +16337 m +4 h +1 h +16338 m +1 h +16339 m +4 h +11948 m +4 h +9467 m +16340 m +4 h +1 h +4 h +1 h +25 h +10 h +4 h +1 h +82 h +31 h +1386 m +114 h +64 h +10 h +157 h +143 h +1 h +10 h +1 h +1 h +16341 m +4 h +1 h +1 h +278 h +1 h +10 h +82 h +10 h +4030 m +10 h +146 h +125 h +4 h +16342 m +4 h +447 h +10 h +16343 m +10 h +10 h +16344 m +16345 m +4 h +2931 m +16346 m +10 h +4 h +16347 m +16348 m +196 h +109 h +16349 m +10 h +4 h +1470 h +4 h +16350 m +16351 m +55 h +36 h +172 h +10062 m +11 h +1 h +125 h +1278 m +16352 m +16353 m +4 h +147 h +16354 m +16355 m +1 h +10 h +143 h +1 h +1 h +4 h +4 h +25 h +16356 m +170 h +1 h +11 h +82 h +4 h +170 h +25 h +45 h +4 h 
+16357 m +10 h +4 h +464 h +2163 m +4 h +10 h +4 h +1 h +82 h +1 h +4 h +147 h +16358 m +16359 m +4 h +16360 m +10 h +16361 m +4 h +109 h +16362 m +4 h +31 h +124 h +16363 m +4 h +4 h +1 h +4 h +10 h +4 h +143 h +16364 m +266 h +82 h +1 h +16365 m +1 h +4 h +83 h +4 h +1279 m +36 h +250 h +143 h +615 m +307 h +92 h +125 h +4 h +16366 m +10 h +11 h +4 h +4 h +12066 m +16367 m +1 h +464 h +9077 m +1 h +4 h +4 h +16368 m +16369 m +10 h +16370 m +825 m +1 h +10 h +1 h +113 h +1 h +1 h +1 h +4 h +12 h +4 h +4 h +1 h +73 h +16371 m +1478 h +976 h +10 h +109 h +125 h +11 h +4 h +94 h +104 h +16372 m +172 h +238 h +258 h +109 h +4 h +4 h +1 h +4 h +16373 m +1 h +16374 m +59 h +10 h +45 h +10 h +41 h +16375 m +1 h +1 h +250 h +57 h +55 h +4 h +73 h +976 h +11 h +1886 m +10 h +4 h +16376 m +164 h +16377 m +11 h +10 h +1 h +190 h +408 m +83 h +10 h +10 h +118 h +10 h +3 h +238 h +4 h +16378 m +307 h +278 h +4 h +16379 m +4 h +11 h +40 m +332 h +16380 m +1 h +1 h +1 h +1470 h +169 h +4 h +1 h +1677 m +4 h +10 h +41 h +156 h +4 h +16381 m +913 m +1 h +6963 m +124 h +10 h +1 h +16382 m +1261 h +16383 m +10 h +4 h +10 h +4 h +1 h +1 h +4 h +330 h +266 h +55 h +94 h +41 h +4 h +16384 m +147 h +3558 h +4 h +10 h +1 h +536 h +1 h +190 h +313 m +125 h +1198 m +1 h +1 h +11 h +10 h +31 h +1 h +297 h +4 h +4 h +16385 m +4 h +190 h +1478 h +358 h +332 h +4 h +1 h +16386 m +4 h +4 h +4 h +4 h +1772 m +25 h +1 h +16387 m +911 h +10 h +109 h +4 h +443 h +10 h +16388 m +4 h +36 h +82 h +4 h +10 h +109 h +4 h +1 h +1 h +16389 m +55 h +435 h +10 h +1957 m +4 h +22 h +1 h +1 h +79 h +147 h +1542 m +1 h +10 h +4 h +256 h +1 h +1576 m +16390 m +1 h +4 h +4030 m +10 h +262 m +4 h +10 h +4 h +4 h +5505 m +11 h +1 h +10 h +14083 m +16391 m +10 h +4 h +4 h +16392 m +1 h +10 h +16393 m +11 h +3680 m +16394 m +1 h +16395 m +4 h +4 h +1 h +10 h +94 h +1 h +16396 m +22 h +10 h +2788 h +143 h +10 h +383 h +4 h +16397 m +103 h +73 h +4 h +1766 h +10 h +4 h +1 h +10 h +1 h +16398 m +11 h +82 h +10 h +3025 
m +4 h +16399 m +4 h +4 h +83 h +31 h +109 h +1535 m +4 h +4 h +4 h +10 h +16400 m +10 h +238 h +1 h +16401 m +10 h +4 h +1 h +4 h +1 h +10 h +74 h +1 h +16402 m +16403 m +4 h +10 h +4 h +12 h +10 h +11 h +74 h +124 h +4 h +16404 m +4 h +172 h +1137 h +16405 m +4 h +10 h +4 h +4 h +238 h +1137 h +4 h +4 h +4 h +1 h +4 h +16406 m +29 m +1 h +358 h +10 h +4 h +16407 m +383 h +10 h +4 h +74 h +1 h +16408 m +16409 m +1685 h +224 h +4 h +109 h +16410 m +447 h +73 h +430 m +169 h +16411 m +16412 m +1 h +16413 m +13 h +4 h +1 h +1198 m +16414 m +10 h +1 h +4240 m +4 h +10 h +82 h +16415 m +16416 m +82 h +143 h +8555 m +16417 m +1 h +4 h +965 h +4 h +4 h +109 h +1 h +109 h +16418 m +146 h +4 h +4 h +4 h +4 h +16419 m +57 h +4 h +4 h +15 m +16420 m +1 h +16421 m +4 h +4 h +109 h +25 h +1 h +1 h +10 h +4 h +109 h +1 h +4 h +56 h +4 h +4 h +4 h +25 h +16422 m +4 h +358 h +16423 m +4 h +1403 h +59 h +10 h +4 h +12 h +94 h +10 h +1936 m +4 h +16424 m +1016 h +1 h +1957 m +1936 m +16425 m +536 h +16426 m +11 h +103 h +278 h +10 h +4 h +1 h +1 h +82 h +11 h +4 h +1 h +4 h +144 m +4 h +4 h +10 h +4 h +4 h +4 h +4 h +144 h +4 h +147 h +1470 h +57 h +1 h +10 h +1 h +16427 m +4 h +135 h +82 h +1 h +4 h +129 h +1 h +4 h +11 h +1 h +4 h +10 h +258 h +1 h +1 h +7479 m +28 h +4 h +16428 m +112 h +10 h +1128 m +4 h +181 h +1 h +10 h +4 h +4457 m +135 h +65 h +4 h +347 m +16429 m +1 h +1 h +10 h +4 h +16430 m +4 h +10 h +16431 m +16432 m +83 h +172 h +4 h +4 h +16433 m +14829 m +65 h +1016 h +1 h +57 h +4 h +10 h +10 h +16434 m +16435 m +1 h +1 h +16436 m +4 h +1 h +16437 m +181 h +10 h +16438 m +4 h +10 h +125 h +135 h +1 h +16439 m +185 h +16440 m +4 h +258 h +642 h +11 h +11 h +16441 m +97 h +143 h +172 h +16442 m +195 h +16443 m +16444 m +97 h +146 h +4 h +4 h +94 h +4 h +16445 m +11 h +1 h +4 h +8 h +4 h +4 h +10 h +16446 m +4 h +16447 m +1 h +16448 m +4 h +278 h +16449 m +4 h +270 h +1 h +57 h +82 h +114 h +1 h +1 h +31 h +4 h +1 h +16450 m +74 h +1 h +4 h +4 h +124 h +238 h +1685 h 
+4 h +1 h +10 h +97 h +1 h +1 h +1 h +4 h +4 h +10 h +97 h +57 h +16451 m +16452 m +119 h +41 h +16453 m +4 h +4 h +1 h +82 h +10 h +4 h +359 h +1 h +1 h +1 h +170 h +266 h +16454 m +4 h +1 h +4 h +109 h +16455 m +10 h +16456 m +16457 m +147 h +169 h +319 h +1 h +16458 m +4 h +1 h +4 h +16459 m +1 h +16460 m +25 h +158 h +10 h +16461 m +55 h +112 h +4 h +48 h +65 h +109 h +109 h +10 h +16462 m +4 h +14569 m +124 h +16463 m +1 h +1770 m +195 h +262 m +16464 m +10 h +8332 m +173 h +687 h +4 h +2788 h +1 h +4 h +307 h +10177 m +10 h +279 h +31 h +16465 m +1 h +10 h +601 h +59 h +4 h +4 h +41 h +94 h +1 h +4 h +1 h +4 h +4 h +4 h +10 h +186 h +1 h +16466 m +124 h +79 h +109 h +10 h +4 h +4 h +10 h +10 h +16467 m +10 h +4 h +4 h +94 h +3720 m +57 h +10 h +4 h +1 h +4 h +10 h +25 h +1 h +4 h +1 h +1 h +181 h +1 h +425 m +4 h +4 h +569 h +10 h +12 h +16468 m +1 h +10 h +4 h +10 h +4 h +31 h +1 h +10 h +114 h +10 h +57 h +10 h +97 h +4 h +4 h +41 h +16469 m +10 h +4 h +7388 m +536 h +1 h +16470 m +1347 m +4 h +4 h +16471 m +1 h +16472 m +4 h +41 h +1 h +4 h +1 h +82 h +4 h +1 h +4 h +10 h +1 h +181 h +4 h +10 h +16473 m +10 h +4 h +6545 m +1 h +92 h +1504 m +57 h +1 h +443 h +4 h +4 h +10 h +12 h +4 h +1 h +4 h +16474 m +1 h +16475 m +1 h +55 h +1105 h +1006 m +59 h +10 h +1 h +4 h +104 h +16476 m +1 h +4895 m +16477 m +468 m +1 h +135 h +1 h +196 h +57 h +16478 m +11 h +4 h +258 h +16479 m +569 h +1 h +10 h +1 h +16480 m +112 h +9912 m +332 h +4089 m +16481 m +195 h +31 h +4 h +1807 m +276 h +79 h +16482 m +1 h +578 h +36 h +4 h +5080 m +1 h +4 h +4 h +840 m +16483 m +10 h +1 h +16484 m +7839 m +1 h +1 h +16485 m +10 h +1 h +1 h +16486 m +3 h +16487 m +172 h +4 h +4 h +10 h +1 h +1511 m +82 h +4 h +4 h +83 h +16488 m +1198 h +16489 m +10 h +10 h +1650 h +4 h +3539 m +4 h +27 h +16490 m +258 h +4 h +10 h +1 h +16491 m +129 h +10 h +4 h +1 h +569 h +1 h +1 h +1 h +10 h +157 h +16492 m +16493 m +1 h +1 h +10 h +1 h +16494 m +4 h +3143 m +4 h +16495 m +4 h +1 h +16496 m 
+16497 m +9701 m +181 h +4 h +4 h +114 h +1 h +16498 m +1 h +4 h +16499 m +1 h +479 h +10 h +11 h +1 h +4 h +16500 m +10 h +3600 m +4 h +1 h +443 h +16501 m +11 h +10 h +4564 m +4 h +104 h +97 h +10 h +4 h +1 h +16502 m +1 h +7253 m +16503 m +45 h +4 h +4 h +104 h +11 h +10 h +4 h +10 h +1 h +4 h +10 h +4 h +4 h +16504 m +16505 m +1 h +10 h +14388 m +4 h +10 h +16506 m +16507 m +124 h +1 h +109 h +195 h +10 h +16508 m +109 h +1 h +4 h +10 h +692 h +2002 h +16509 m +4 h +10 h +16510 m +41 h +4 h +10 h +41 h +10 h +4 h +31 h +74 h +119 h +65 h +10 h +59 h +10 h +55 h +4 h +45 h +4 h +10 h +196 h +16511 m +11 h +1 h +10 h +4 h +1 h +1 h +4 h +1 h +5230 m +124 h +4 h +16512 m +16513 m +114 h +10 h +1 h +4 h +119 h +4 h +4 h +109 h +4 h +169 h +16514 m +16515 m +1 h +4 h +10 h +4 h +1714 h +1 h +1 h +297 h +31 h +16516 m +73 h +65 h +16517 m +640 h +147 h +3 h +4 h +16518 m +230 m +4 h +195 h +16519 m +4 h +1 h +4 h +40 m +4 h +4 h +10 h +1 h +4 h +4 h +648 m +4 h +69 h +16520 m +114 h +4 h +16521 m +4 h +11 h +13 h +146 h +1714 h +3 h +16522 m +16523 m +16524 m +3 h +10 h +270 h +1 h +10 h +1 h +10 h +1685 h +256 h +1 h +79 h +1 h +4372 m +16525 m +4 h +270 h +10 h +4 h +124 h +4 h +1 h +4 h +109 h +1 h +1 h +16526 m +4 h +16527 m +16528 m +1 h +238 h +228 m +59 h +10 h +1 h +1 h +4 h +65 h +40 h +1 h +629 m +16529 m +4 h +10 h +1 h +1 h +55 h +289 h +12 h +1714 h +157 h +10 h +8497 m +11 h +16530 m +1 h +3398 m +16531 m +13 h +5695 m +10 h +1 h +1 h +4 h +258 h +10 h +11 h +16532 m +4 h +10 h +16533 m +4 h +10 h +16534 m +10 h +125 h +16535 m +82 h +4 h +13 h +1 h +1016 h +10 h +4 h +4 h +4 h +4 h +1 h +10 h +16536 m +16537 m +779 h +1 h +3070 m +10 h +16538 m +16539 m +256 h +4 h +1 h +4 h +10 h +82 h +10 h +16540 m +4 h +1 h +3 h +41 h +1828 m +1 h +1 h +1 h +1 h +170 h +82 h +16541 m +10 h +16542 m +181 h +73 h +3 h +10 h +5 h +16543 m +16544 m +119 h +10942 m +10958 m +4 h +10 h +74 h +1 h +4 h +4 h +16545 m +16546 m +4 h +1 h +4 h +4 h +4 h +125 h +129 h +16547 m 
+139 h +1 h +4 h +4 h +59 h +4 h +10 h +4 h +57 h +4 h +14642 m +65 h +195 h +10 h +16548 m +10 h +4 h +83 h +4 h +4 h +65 h +1 h +10 h +16549 m +4 h +129 h +10 h +939 m +1939 m +16550 m +4 h +4 h +1 h +11 h +1 h +4 h +1 h +4 h +5348 m +22 h +45 h +10 h +4 h +139 h +114 h +4 h +4 h +1114 m +258 h +4 h +258 h +10 h +16551 m +1 h +157 h +172 h +55 h +4 h +2474 m +75 m +1 h +1 h +468 m +4 h +1 h +4 h +125 h +10 h +16552 m +59 h +1 h +1 h +10 h +4177 m +1 h +57 h +4 h +16553 m +7535 m +16554 m +4 h +16555 m +2265 m +16556 m +966 h +276 h +16147 m +10 h +13084 m +195 h +10 h +125 h +11 h +1 h +4 h +1 h +1 h +1 h +4 h +16557 m +10 h +16558 m +185 h +1 h +11 h +25 h +64 h +23 h +500 m +10 h +10 h +16559 m +1 h +10 h +10 h +1 h +16560 m +601 h +16561 m +64 h +1 h +45 h +16562 m +1 h +10 h +1 h +278 h +10378 m +10 h +27 h +10 h +11 h +10464 m +16563 m +1 h +1 h +3028 m +109 h +4 h +54 m +4 h +109 h +4 h +10 h +31 h +386 h +2923 h +4 h +140 h +10 h +4 h +17 m +27 h +10 h +1 h +1 h +172 h +4 h +97 h +4 h +10 h +10 h +16564 m +97 h +4 h +11141 m +59 h +4 h +1 h +4 h +4 h +16565 m +4 h +13 h +57 h +4 h +10 h +112 h +1766 h +1 h +31 h +1 h +173 h +4 h +1 h +266 h +4 h +25 h +12 h +4 h +82 h +4 h +1 h +36 h +4 h +12 h +16566 m +16567 m +94 h +7135 m +4 h +16568 m +489 m +4 h +82 h +16569 m +104 h +10 h +10 h +10 h +4 h +581 m +1 h +4 h +4 h +1 h +8 h +124 h +10 h +31 h +16570 m +4 h +332 h +4 h +4 h +4 h +359 h +16571 m +41 h +48 h +16572 m +4 h +16573 m +1 h +11 h +10682 m +4 h +4 h +16574 m +10 h +10 h +4 h +16575 m +10 h +1 h +16576 m +4 h +1 h +1 h +94 h +4 h +1 h +16577 m +16578 m +16579 m +16580 m +10 h +1 h +3673 m +4 h +1 h +10 h +4 h +4 h +41 h +113 h +297 h +65 h +16581 m +4 h +4 h +1470 h +4 h +10 h +10 h +1 h +16582 m +1 h +1 h +16583 m +16584 m +143 h +4 h +10 h +3 h +4 h +16585 m +1 h +1 h +4 h +10 h +138 h +10 h +4 h +13958 m +124 h +10 h +16586 m +25 h +57 h +16587 m +4522 m +16588 m +1 h +4 h +1 h +16589 m +1 h +146 h +4 h +1 h +16590 m +16591 m +224 h +181 h 
+195 h +10 h +4 h +10 h +59 h +10 h +4 h +4 h +1 h +4 h +4 h +4 h +5567 m +2391 m +10 h +16592 m +4 h +1 h +4 h +16593 m +4 h +16594 m +1 h +57 h +4 h +10 h +112 h +10 h +16595 m +4 h +10 h +57 h +4 h +1 h +109 h +1470 h +10 h +16596 m +114 h +1 h +11 h +443 h +10 h +911 h +1 h +10 h +1 h +1 h +16597 m +4 h +1 h +16598 m +16599 m +16600 m +16601 m +1 h +4 h +4 h +4 h +16602 m +16603 m +4 h +16604 m +12 h +6558 m +82 h +155 m +16605 m +11 h +4 h +16606 m +1 h +4 h +1 h +4 h +11 h +10 h +94 h +1 h +1 h +1 h +1 h +16607 m +16608 m +4 h +1249 m +1 h +10 h +4 h +10 h +1 h +9482 m +1 h +57 h +1 h +59 h +1 h +4 h +10 h +3 h +12 h +4 h +10 h +4 h +195 h +11 h +10 h +104 h +158 h +16609 m +4 h +4 h +1 h +16610 m +1 h +97 h +16611 m +10 h +4 h +16612 m +109 h +97 h +976 h +10 h +4 h +147 h +4 h +4 h +4 h +4 h +4 h +258 h +717 m +4 h +190 h +4 h +61 m +1 h +3606 m +1 h +4 h +6473 m +4 h +2480 m +8 h +59 h +4 h +146 h +986 h +10 h +3533 m +10 h +4 h +285 m +1 h +16613 m +16614 m +59 h +10 h +100 m +1 h +5003 m +11 h +7087 m +13 h +4 h +1 h +2887 m +10 h +16615 m +4 h +10 h +1 h +1 h +16616 m +1 h +110 h +590 m +57 h +22 h +12990 m +1642 h +143 h +1 h +4 h +16617 m +16618 m +4 h +4 h +10 h +4 h +1 h +16619 m +8 h +16620 m +45 h +79 h +10 h +16621 m +4 h +4 h +10 h +7382 m +1 h +1 h +16622 m +4 h +1 h +16623 m +10 h +11 h +1 h +16624 m +97 h +443 h +124 h +16625 m +57 h +83 h +55 h +4 h +3025 m +6678 m +4 h +82 h +1 h +1 h +266 h +16626 m +4 h +4 h +10 h +10 h +119 h +10 h +10 h +195 h +4 h +57 h +124 h +825 m +1 h +143 h +1 h +16627 m +16628 m +4 h +109 h +10 h +4 h +4 h +25 h +10 h +4 h +10 h +1 h +82 h +3 h +5230 m +1 h +196 h +4 h +16629 m +16630 m +11334 m +1 h +1 h +4 h +104 h +167 h +1 h +146 h +65 h +1 h +16631 m +4 h +10 h +4 h +10 h +16632 m +10 h +1 h +1 h +16633 m +10 h +16634 m +4 h +16635 m +16636 m +1 h +25 h +1 h +1 h +1766 h +4 h +5 h +73 h +16637 m +4 h +16638 m +195 h +1 h +1 h +1 h +10 h +4 h +104 h +4561 m +4 h +1 h +82 h +16639 m +10 h +1975 m +10 h +123 h 
+16640 m +25 h +16641 m +57 h +4 h +16642 m +16643 m +16644 m +82 h +124 h +82 h +164 h +4 h +16645 m +11 h +45 h +41 h +4 h +16646 m +4 h +10 h +8511 m +4 h +10 h +4 h +1 h +779 h +11 h +10 h +1 h +976 h +104 h +4 h +4 h +1 h +82 h +1 h +7661 m +4 h +1 h +4 h +10 h +278 h +82 h +41 h +4 h +16647 m +16648 m +65 h +3 h +770 m +339 m +2530 m +1 h +10 h +10 h +1 h +4 h +10 h +4 h +278 h +1 h +1 h +1 h +1 h +2184 m +4 h +1 h +11 h +4 h +4 h +4 h +16649 m +16650 m +4 h +4 h +16651 m +104 h +4 h +10 h +4 h +4 h +1 h +190 h +16652 m +4 h +4 h +4 h +4 h +125 h +45 h +1 h +16653 m +16654 m +1 h +16655 m +10 h +16656 m +109 h +4 h +4 h +4 h +10 h +11 h +10 h +85 m +10 h +536 h +4 h +25 h +16657 m +16658 m +1 h +10 h +4 h +16659 m +27 h +353 h +258 h +82 h +4 h +59 h +4 h +1 h +4 h +4 h +16660 m +83 h +190 h +250 h +4 h +601 h +976 h +4 h +4 h +83 h +1 h +10 h +16661 m +79 h +1 h +4 h +4 h +10 h +1 h +10 h +10 h +113 h +4 h +4 h +10 h +4 h +1 h +16662 m +14476 m +4 h +10 h +1 h +1 h +10 h +40 h +10 h +92 h +307 h +14 m +1685 h +64 h +16663 m +1 h +16664 m +16665 m +1 h +10 h +16666 m +1 h +10 h +10 h +57 h +124 h +1 h +16667 m +156 h +195 h +10 h +10 h +104 h +16668 m +557 m +274 h +3 h +10 h +468 h +4 h +1 h +16669 m +615 m +4 h +307 h +16670 m +386 h +4 h +1 h +1714 h +16671 m +16672 m +4 h +157 h +4 h +16673 m +278 h +1714 h +91 h +857 h +10 h +4 h +4 h +11 h +1 h +1 h +1 h +79 h +4 h +10 h +16674 m +36 h +16675 m +10 h +16676 m +10 h +16677 m +1 h +1 h +16678 m +146 h +10 h +4 h +125 h +45 h +4 h +1366 m +16679 m +4 h +1 h +4 h +16680 m +3 h +143 h +56 h +147 h +16681 m +1 h +190 h +16682 m +383 h +16683 m +477 m +1915 m +135 h +1 h +258 h +16684 m +16685 m +109 h +11 h +1 h +4 h +64 h +4 h +10 h +16686 m +11 h +41 h +1403 h +1 h +45 h +12585 m +1 h +10 h +16687 m +73 h +16688 m +83 h +265 h +258 h +4 h +74 h +10 h +55 h +4 h +10 h +16689 m +1 h +4 h +1003 m +10 h +4 h +10 h +1 h +4 h +16690 m +4 h +16691 m +4 h +16692 m +16693 m +4 h +1 h +36 h +1 h +16694 m +4 h +146 h 
+157 h +1 h +10 h +16695 m +16696 m +1105 h +11 h +2172 m +16697 m +1 h +22 h +4 h +1 h +1 h +601 h +16698 m +1 h +10 h +1 h +16699 m +1 h +1 h +16700 m +1 h +1 h +1 h +16701 m +1 h +16702 m +10 h +1 h +4 h +185 h +4 h +10 h +82 h +169 h +16703 m +1 h +4 h +41 h +4 h +57 h +16704 m +1 h +10 h +146 h +463 m +16705 m +1 h +1 h +11 h +4 h +1 h +104 h +4 h +103 h +56 h +1 h +16706 m +16707 m +109 h +10 h +16708 m +4 h +4 h +4 h +16709 m +1 h +4 h +1 h +10 h +109 h +11 h +10 h +4 h +1 h +4 h +1 h +800 m +266 h +4 h +493 m +16710 m +4 h +16711 m +3 h +59 h +1 h +16712 m +16713 m +3 h +4 h +10 h +3 h +4 h +83 h +25 h +146 h +4 h +40 h +4 h +16714 m +1 h +124 h +4 h +1 h +113 h +4 h +219 h +1 h +195 h +4 h +1470 h +55 h +16715 m +8 h +4 h +1 h +1 h +1 h +65 h +104 h +157 h +16716 m +16717 m +11 h +16718 m +57 h +4 h +83 h +3 h +16719 m +1 h +307 h +16720 m +4 h +11 h +16721 m +4861 m +16722 m +16723 m +10 h +4297 m +97 h +4 h +104 h +16724 m +1 h +16725 m +4101 m +1766 h +4 h +4 h +169 h +97 h +4 h +59 h +4 h +4 h +11 h +16726 m +250 h +10 h +1 h +4 h +16727 m +1 h +4 h +1 h +16728 m +615 m +16729 m +4 h +478 m +10 h +16730 m +1 h +10 h +16731 m +299 h +4 h +31 h +10 h +1545 m +1 h +4 h +1 h +4 h +114 h +11 h +4 h +1 h +16732 m +4 h +4 h +16733 m +4 h +1 h +10 h +2625 m +82 h +16734 m +16735 m +1 h +332 h +10 h +186 h +4 h +11 h +57 h +195 h +1 h +4 h +25 h +16736 m +57 h +10 h +64 h +1 h +10 h +4 h +1791 m +4 h +1 h +1 h +16737 m +1 h +10 h +1 h +10 h +1 h +1 h +16738 m +82 h +4 h +11 h +4 h +1 h +16739 m +16740 m +16741 m +1 h +65 h +10 h +10 h +4359 m +1 h +1 h +16742 m +45 h +4 h +16743 m +16744 m +10 h +16745 m +16746 m +4 h +3 h +1 h +16747 m +16748 m +10 h +11 h +1 h +1 h +16749 m +4 h +1083 h +16750 m +4 h +31 h +1 h +4 h +11 h +10 h +16751 m +16752 m +10 h +4 h +10 h +181 h +16753 m +2710 m +307 h +16754 m +16755 m +82 h +125 h +4 h +385 m +16756 m +1185 m +10 h +3 h +10 h +4 h +4 h +4 h +4 h +1766 h +16757 m +278 h +4 h +1 h +330 h +1 h +4 h +1 h +4 h +4 h +16758 
m +16759 m +16760 m +135 h +10 h +25 h +124 h +601 h +390 m +2339 m +4 h +11 h +16761 m +4 h +355 m +146 h +10 h +16762 m +265 h +1 h +146 h +3533 m +1 h +64 h +10 h +82 h +16763 m +16764 m +4 h +4 h +13 h +59 h +16765 m +46 m +4 h +59 h +4 h +4 h +10 h +2733 h +10 h +16766 m +55 h +2971 m +1 h +10 h +1 h +10 h +109 h +16767 m +1 h +112 h +22 h +1 h +1 h +11 h +4 h +4 h +11 h +16768 m +125 h +1 h +4 h +1 h +10 h +1 h +13 h +16769 m +41 h +109 h +10 h +536 h +1 h +10 h +45 h +1 h +10 h +4 h +1 h +1 h +4 h +4 h +517 m +10 h +4 h +4 h +10 h +16770 m +16771 m +4 h +4 h +1 h +16772 m +10 h +4 h +4 h +2887 m +4441 m +41 h +10 h +16773 m +10 h +2710 m +4 h +109 h +10 h +4 h +1 h +59 h +322 h +16774 m +976 h +16775 m +238 h +4 h +1 h +4 h +1 h +4 h +16776 m +1 h +10 h +10 h +16777 m +10 h +31 h +10 h +4 h +4 h +241 h +10 h +1 h +124 h +55 h +12 h +10 h +1 h +10 h +3 h +4 h +506 m +4 h +16778 m +4 h +16779 m +4 h +1835 m +4 h +4 h +109 h +16780 m +16781 m +11 h +435 h +1 h +4 h +1 h +16782 m +2300 m +16783 m +1 h +976 h +1 h +1 h +1 h +16784 m +2379 h +16785 m +4 h +16786 m +6461 m +1 h +25 h +10 h +11 h +4 h +1 h +1 h +11 h +10 h +82 h +4 h +11 h +278 h +687 h +186 h +73 h +97 h +97 h +469 m +10 h +45 h +195 h +11 h +16787 m +16788 m +1321 m +16789 m +1 h +16790 m +1 h +10 h +4 h +270 h +10 h +4 h +65 h +31 h +10 h +114 h +4 h +125 h +4 h +1 h +8140 m +4 h +4 h +11707 m +4 h +16791 m +10 h +4 h +1 h +1 h +4 h +358 h +1 h +109 h +4 h +16792 m +1 h +4 h +158 h +16793 m +4 h +10 h +10 h +124 h +16794 m +964 m +4 h +1 h +119 h +25 h +16795 m +55 h +109 h +383 h +4 h +82 h +857 h +4724 m +104 h +4 h +16796 m +16797 m +16798 m +31 h +1 h +56 h +4 h +196 h +143 h +4 h +16799 m +82 h +1 h +10 h +4 h +10 h +4 h +11346 m +1 h +16800 m +1 h +16801 m +1 h +4 h +16802 m +10 h +10 h +57 h +10 h +4 h +229 h +16803 m +1 h +16804 m +1403 h +5308 m +16805 m +2851 h +16806 m +10 h +104 h +4 h +4 h +16514 m +358 h +157 h +4 h +10 h +1478 h +94 h +4 h +1 h +74 h +124 h +10 h +8497 m +16807 m 
+4 h +2040 m +4 h +359 h +31 h +1 h +10 h +1 h +10 h +157 h +4 h +16808 m +16809 m +1 h +1 h +1 h +464 h +1 h +16810 m +10 h +190 h +10 h +4 h +10 h +112 h +57 h +1 h +4905 h +2087 m +16811 m +83 h +55 h +83 h +10 h +99 m +4 h +10 h +146 h +4 h +1478 h +1016 h +16812 m +319 h +262 h +4 h +4 h +4 h +4 h +1 h +4 h +25 h +4 h +16813 m +278 h +4 h +4 h +16814 m +104 h +181 h +1 h +41 h +1714 h +359 h +82 h +4 h +10 h +1 h +16815 m +16816 m +16817 m +4 h +16748 m +1 h +70 m +172 h +16818 m +4 h +124 h +16819 m +4 h +2788 h +4 h +4 h +16820 m +16821 m +1083 h +10 h +4 h +4 h +297 h +2002 h +1666 m +1492 m +97 h +169 h +146 h +4 h +16822 m +1 h +4 h +570 m +4 h +1 h +16823 m +1 h +16824 m +1 h +10 h +16825 m +10 h +16826 m +10 h +75 m +12 h +10 h +4 h +10 h +1261 h +10 h +10 h +238 h +10 h +1 h +16827 m +173 h +16828 m +4 h +1137 h +9912 m +4 h +27 h +73 h +4 h +16829 m +59 h +10901 m +113 h +4 h +4 h +4 h +10 h +4 h +10 h +10 h +124 h +16830 m +10 h +1 h +190 h +16831 m +4 h +4 h +125 h +4 h +4 h +1 h +181 h +4 h +16832 m +82 h +12 h +124 h +1 h +11 h +1 h +1250 h +6266 m +16833 m +1766 h +1 h +74 h +10 h +11 h +13 h +65 h +4 h +1 h +4 h +16834 m +109 h +195 h +4 h +1 h +10 h +4 h +10 h +10 h +4 h +11 h +1 h +4 h +10 h +4 h +4 h +2192 m +1 h +1 h +1 h +4 h +16835 m +266 h +4 h +16836 m +10 h +4 h +4 h +4 h +10 h +4 h +4 h +1 h +55 h +16837 m +332 h +10 h +10 h +16838 m +4 h +4 h +1 h +399 h +16839 m +59 h +10 h +443 h +64 h +4 h +4 h +976 h +10 h +10 h +238 h +4 h +2495 m +11 h +2265 m +4 h +16840 m +1 h +1 h +16841 m +295 h +1 h +16842 m +1 h +10 h +6185 m +57 h +16843 m +4 h +25 h +16844 m +4 h +1 h +4 h +16845 m +124 h +10 h +1 h +16846 m +16847 m +16848 m +4 h +36 h +16849 m +1 h +59 h +1875 m +4 h +1 h +1886 m +11 h +16850 m +40 h +266 h +16851 m +94 h +16852 m +74 h +4 h +10 h +16853 m +4 h +109 h +204 m +4 h +1 h +172 h +1 h +1 h +1 h +4 h +10 h +10 h +794 m +4 h +4 h +1 h +4 h +31 h +4 h +10 h +10 h +4 h +2002 h +1 h +196 h +196 h +4 h +4 h +83 h +16854 m +4 h 
+125 h +1 h +16855 m +4 h +83 h +1 h +109 h +1 h +1 h +10 h +109 h +16856 m +1 h +10 h +1 h +11 h +4 h +4 h +31 h +16857 m +1 h +16858 m +10 h +4 h +3 h +59 h +14814 m +1 h +4 h +10 h +4 h +16859 m +12 h +1 h +278 h +10 h +4 h +135 h +1 h +16860 m +3 h +16861 m +4 h +4 h +114 h +1 h +1 h +10 h +1 h +7802 m +11 h +4 h +4 h +16862 m +56 h +338 h +59 h +10 h +4 h +4 h +4 h +114 h +888 h +1 h +181 h +1 h +146 h +1105 h +82 h +10 h +16863 m +4 h +5610 m +1564 m +10 h +16864 m +190 h +1 h +2914 m +109 h +10 h +1 h +1137 h +16865 m +16866 m +4 h +8 h +278 h +10 h +276 h +10 h +355 m +1 h +4 h +1 h +16867 m +11 h +118 h +1 h +1 h +4 h +16868 m +4 h +16869 m +4 h +1 h +16870 m +1 h +10 h +16871 m +13392 m +313 m +4 h +4 h +16872 m +4 h +4 h +135 h +10 h +16873 m +16874 m +4 h +195 h +4 h +1 h +16875 m +16876 m +4 h +4 h +4 h +4 h +16877 m +10 h +397 m +65 h +109 h +1437 m +5526 m +1 h +1 h +16878 m +12 h +1 h +10 h +1089 h +185 h +10 h +358 h +16879 m +110 h +4 h +4 h +4 h +10 h +156 h +164 h +190 h +3 h +190 h +41 h +4 h +16880 m +16881 m +4 h +1 h +1 h +16882 m +125 h +1 h +16883 m +737 m +4 h +3155 m +41 h +1 h +1 h +4 h +4 h +1 h +11 h +11 h +1 h +59 h +4 h +10 h +36 h +25 h +108 h +11 h +4 h +7 m +10 h +10 h +16884 m +16885 m +2194 m +124 h +4 h +16886 m +123 h +25 h +1 h +10 h +125 h +4 h +9933 m +718 h +56 h +4 h +4 h +109 h +1 h +279 h +1 h +16887 m +83 h +1137 h +3 h +1 h +4 h +1 h +4 h +1 h +25 h +4 h +4 h +4 h +332 h +16888 m +1 h +4 h +4 h +16889 m +1 h +1 h +4 h +10 h +10 h +4 h +332 h +10 h +4 h +11 h +124 h +147 h +4 h +4 h +16890 m +10 h +1 h +4 h +16891 m +5224 m +5709 m +10 h +1 h +203 h +4 h +16892 m +1 h +16893 m +150 m +16894 m +10 h +4 h +1 h +5357 m +16895 m +79 h +16896 m +56 h +31 h +4 h +4 h +10 h +11 h +170 h +4 h +4 h +1 h +82 h +10 h +4 h +4 h +289 h +444 m +10 h +16897 m +386 h +10 h +1083 h +1 h +16898 m +1 h +11731 m +1 h +10 h +4 h +8 h +16899 m +10 h +10 h +4 h +2233 m +4 h +147 h +4 h +4 h +45 h +10 h +1 h +12 h +5053 m +10 h +16900 m +258 
h +16901 m +4 h +1 h +1 h +1886 m +4 h +16902 m +10 h +4 h +83 h +10 h +4 h +16903 m +1 h +1 h +1 h +135 h +1056 m +16904 m +1 h +109 h +16905 m +1 h +10 h +11 h +97 h +16906 m +10 h +104 h +74 h +1 h +16907 m +10 h +16908 m +2846 m +16909 m +10 h +123 h +4 h +4 h +4 h +158 h +74 h +4 h +10 h +1 h +4 h +11715 m +447 h +4848 m +16910 m +124 h +4 h +10 h +16911 m +167 h +10 h +1 h +4 h +1 h +10 h +16912 m +16913 m +16914 m +10 h +16915 m +1 h +620 m +124 h +1 h +1 h +1 h +1 h +36 h +10 h +173 h +11 h +10 h +57 h +16916 m +4 h +124 h +4 h +1 h +10 h +10 h +13 h +4 h +4 h +1 h +1 h +10 h +1 h +4 h +4 h +1 h +16917 m +4 h +4 h +16918 m +16919 m +355 h +4 h +16920 m +4 h +1 h +204 h +4 h +4 h +11 h +1 h +109 h +10 h +4 h +332 h +4608 m +82 h +55 h +16921 m +4 h +279 h +12 h +1406 m +990 m +4 h +3 h +1 h +330 h +1 h +4 h +57 h +41 h +1 h +1 h +1 h +1 h +10 h +338 h +3 h +125 h +36 h +1650 h +4 h +10 h +3 h +16922 m +4 h +353 h +1 h +4 h +169 h +1 h +1 h +11 h +359 h +41 h +16923 m +1 h +4 h +4 h +1504 m +1 h +11 h +4 h +16924 m +1 h +41 h +11825 m +1 h +16925 m +423 m +2436 m +16926 m +16927 m +4 h +10 h +4 h +1 h +10 h +1 h +10 h +16928 m +4 h +757 h +16929 m +1 h +4 h +4 h +7214 m +230 m +1250 h +1 h +1 h +59 h +16930 m +4 h +10 h +4 h +10 h +1337 m +4 h +4 h +10 h +65 h +10 h +2865 m +3 h +16931 m +4538 m +656 m +16932 m +4 h +266 h +16933 m +16934 m +16935 m +12 h +10 h +10 h +4590 m +1 h +16936 m +1 h +1 h +4 h +1 h +1772 m +1 h +228 m +1 h +11 h +16937 m +1642 h +57 h +124 h +16938 m +10 h +16939 m +140 h +74 h +4 h +16940 m +3 h +1 h +1 h +143 h +1 h +4 h +16941 m +0 m +109 h +238 h +94 h +1 h +1 h +16942 m +1 h +124 h +84 m +9411 m +4 h +1 h +55 h +4 h +16943 m +4 h +224 h +15156 m +4 h +16944 m +1 h +196 h +4 h +4 h +10 h +1 h +11 h +1975 m +173 h +1 h +59 h +1 h +10 h +1 h +16945 m +109 h +4 h +16946 m +59 h +601 h +1847 m +16947 m +74 h +10 h +1 h +1650 h +1137 h +1 h +4 h +1 h +125 h +4 h +4 h +16948 m +83 h +1 h +4 h +10 h +4 h +10 h +4 h +16949 m +11 h +3 h 
+1 h +10 h +4 h +196 h +16950 m +4 h +4 h +169 h +4 h +22 h +119 h +10 h +3236 m +10 h +181 h +1 h +4 h +297 h +10 h +1 h +57 h +114 h +4 h +4 h +10 h +104 h +1114 m +123 h +79 h +238 h +4 h +16951 m +5514 m +4 h +4 h +123 h +1 h +1 h +10 h +135 h +241 h +12 h +4 h +16952 m +4 h +1 h +10 h +2887 m +16953 m +4 h +976 h +65 h +278 h +1 h +1 h +16954 m +104 h +10 h +16955 m +1 h +112 h +41 h +59 h +265 h +59 h +10 h +1 h +4 h +16956 m +1 h +16957 m +59 h +157 h +4 h +10 h +10 h +12131 m +4 h +16958 m +10 h +13 h +276 h +10 h +147 h +4 h +1321 m +4 h +31 h +16959 m +4 h +4 h +1 h +204 h +4 h +16960 m +278 h +4 h +16961 m +16962 m +10 h +4 h +16963 m +73 h +4 h +144 h +16964 m +1 h +73 h +8 h +16965 m +22 h +10 h +1 h +4 h +119 h +1 h +1 h +266 h +976 h +167 h +16966 m +10 h +10 h +4 h +4030 h +17 m +630 m +10 h +109 h +10 h +1 h +1 h +16967 m +1 h +1 h +10 h +224 h +82 h +16968 m +4 h +4 h +1 h +1 h +4 h +1 h +92 h +59 h +4 h +31 h +16969 m +4 h +1 h +1 h +4 h +1 h +10 h +4 h +16970 m +1 h +83 h +447 h +16971 m +1 h +224 h +11 h +4 h +2379 h +16972 m +4 h +167 h +4 h +36 h +4 h +3 h +77 h +10 h +1 h +4 h +16973 m +10 h +4 h +10 h +11 h +55 h +112 h +4 h +146 h +124 h +10 h +16974 m +13833 m +10 h +11 h +10 h +1 h +16975 m +13 h +16976 m +1089 h +1 h +10 h +10 h +4 h +265 h +16977 m +16978 m +1 h +1 h +16979 m +10 h +4 h +57 h +10 h +16980 m +16981 m +4 h +4 h +4 h +557 m +332 h +2054 m +10 h +10 h +10 h +808 m +4 h +569 h +10 h +16982 m +1 h +4 h +4 h +40 h +10 h +41 h +109 h +104 h +10 h +10 h +629 m +45 h +4 h +16983 m +9396 m +229 h +11 h +10 h +1 h +1 h +10 h +16984 m +1 h +13 h +1016 h +4 h +7236 m +4 h +91 h +1 h +11 h +2358 m +230 m +447 h +33 m +10 h +4 h +16985 m +10 h +316 m +241 h +4 h +3 h +4 h +3987 m +143 h +55 h +4 h +463 m +1 h +16986 m +4 h +1 h +1260 m +10 h +1 h +4 h +2314 m +1 h +59 h +13324 m +278 h +1 h +92 h +332 h +2438 m +16987 m +1 h +103 h +1 h +74 h +371 h +684 m +16988 m +10 h +3 h +10 h +16989 m +16990 m +10 h +11 h +164 h +4 h +1 h +399 h 
+10 h +1 h +16991 m +4 h +169 h +119 h +4 h +4057 m +10 h +16992 m +4 h +1027 h +109 h +1 h +82 h +1 h +4 h +11 h +4 h +4 h +1470 h +266 h +16993 m +4 h +10 h +4 h +16994 m +1 h +4 h +190 h +16995 m +2951 m +1 h +1 h +25 h +1 h +4 h +125 h +4 h +1 h +265 h +4 h +10 h +64 h +1 h +203 h +4 h +10 h +3 h +297 h +16996 m +4 h +1 h +4 h +59 h +16997 m +1 h +10 h +10 h +1 h +1 h +16998 m +16999 m +17000 m +8040 m +82 h +4 h +17001 m +17002 m +11 h +888 h +10 h +169 h +119 h +57 h +4 h +17003 m +17004 m +4 h +1 h +4 h +4 h +31 h +10 h +170 h +4 h +1 h +10 h +10 h +1 h +4 h +41 h +4 h +10 h +112 h +1 h +13907 m +1 h +4 h +1 h +45 h +4 h +167 h +17005 m +135 h +17006 m +4 h +1 h +17007 m +10 h +10 h +82 h +1 h +64 h +57 h +10 h +17008 m +270 h +65 h +124 h +4 h +17009 m +4 h +1 h +169 h +17010 m +1016 h +1 h +196 h +4 h +4 h +4 h +31 h +17011 m +8486 m +1 h +258 h +4 h +17012 m +843 h +10 h +4 h +11 h +1370 m +4 h +45 h +74 h +1 h +1 h +578 h +1 h +55 h +443 h +10 h +97 h +83 h +4 h +17013 m +1 h +1 h +4 h +4 h +17014 m +4 h +17015 m +4 h +17016 m +97 h +1 h +11 h +4 h +17017 m +10 h +10 h +4 h +4 h +4 h +4 h +10 h +164 h +104 h +17018 m +4 h +4 h +779 h +400 m +4 h +10 h +17019 m +48 h +41 h +17020 m +17021 m +10 h +4 h +60 m +17022 m +4 h +347 h +1 h +4 h +17023 m +10 h +4 h +1 h +82 h +316 m +12 h +4 h +1083 h +4 h +1 h +4 h +4 h +4 h +169 h +4 h +10 h +4 h +17024 m +4 h +4240 m +181 h +4 h +17025 m +10 h +4 h +1 h +717 m +196 h +1 h +17026 m +4 h +65 h +64 h +10 h +1 h +6558 m +4 h +1 h +4 h +4 h +10 h +4 h +56 h +17027 m +17028 m +10 h +10 h +425 m +13 h +4 h +31 h +125 h +17029 m +1 h +4 h +4 h +1 h +3286 m +4 h +17030 m +10 h +10 h +109 h +184 h +4332 m +1 h +4 h +2340 m +1 h +185 h +17031 m +125 h +17032 m +17033 m +10 h +4 h +4 h +1 h +17034 m +17035 m +4 h +17036 m +4 h +4 h +4 h +4 h +10 h +17037 m +2887 m +65 h +11 h +6010 m +31 h +22 h +25 h +17038 m +1 h +10 h +1 h +770 m +4 h +4 h +10 h +4 h +195 h +4 h +4 h +17039 m +4 h +4 h +124 h +17040 m +4 h +82 h +17041 
m +1 h +17042 m +17043 m +59 h +114 h +9396 m +17044 m +10 h +181 h +4 h +4 h +1 h +4 h +17045 m +1 h +4 h +229 h +9831 m +17046 m +11910 m +928 m +10 h +10 h +10 h +17047 m +1 h +41 h +11 h +1 h +17048 m +10 h +17049 m +17050 m +10 h +11 h +10 h +10 h +4 h +1 h +156 h +4 h +10 h +10 h +976 h +10 h +1 h +17051 m +17052 m +10 h +1 h +11 h +1 h +17053 m +1 h +1 h +1 h +976 h +10 h +4 h +17054 m +250 h +1 h +1 h +17055 m +3 h +195 h +4 h +17056 m +10 h +1105 h +1 h +10 h +36 h +3 h +1027 h +10 h +757 h +17057 m +25 h +10 h +1 h +1 h +368 h +4 h +11 h +125 h +185 h +17058 m +196 h +1 h +79 h +4 h +2172 m +299 h +59 h +17059 m +307 h +13 h +1 h +11 h +173 h +4 h +10 h +181 h +1 h +17060 m +17061 m +17062 m +371 h +1 h +4 h +262 h +10 h +10 h +408 m +17063 m +4 h +17064 m +10 h +1 h +10 h +1 h +4 h +2459 m +140 h +10 h +10 h +4 h +238 h +1 h +4 h +4 h +10 h +1 h +17065 m +4 h +147 h +125 h +4 h +17066 m +10 h +3322 m +17067 m +4 h +1 h +22 h +164 h +31 h +4 h +1 h +8 h +10 h +443 h +649 m +4 h +4 h +274 h +1 h +4 h +10 h +1 h +1 h +1 h +17068 m +4 h +3 h +74 h +4 h +4 h +4 h +1 h +17069 m +4 h +10 h +258 h +4 h +4 h +3 h +82 h +79 h +17070 m +104 h +1 h +195 h +692 h +11 h +11 h +17071 m +10 h +17072 m +1 h +10 h +10 h +17073 m +146 h +1 h +4 h +767 m +4 h +104 h +104 h +1 h +17074 m +4177 m +1 h +17075 m +17076 m +11 h +1 h +1620 m +4 h +17077 m +229 h +857 h +10062 m +170 h +17078 m +17079 m +17080 m +1 h +1 h +46 m +17081 m +45 h +17082 m +4 h +3 h +1 h +124 h +10 h +17083 m +41 h +1 h +4 h +10 h +17084 m +59 h +17085 m +3557 m +195 h +17086 m +17087 m +10 h +17088 m +17089 m +10 h +55 h +82 h +10 h +297 h +17090 m +569 h +17091 m +10 h +1 h +10 h +17092 m +10 h +6946 m +9800 m +1 h +31 h +4 h +124 h +82 h +1 h +17093 m +265 h +10 h +1 h +184 h +55 h +1 h +1 h +4 h +3 h +4 h +31 h +10 h +158 h +10 h +10 h +4 h +1 h +10 h +1 h +12 h +3704 m +1 h +297 h +25 h +57 h +4 h +59 h +1 h +28 h +11 h +36 h +1 h +4 h +5125 m +36 h +4 h +4 h +57 h +1 h +1 h +10 h +10 h +10 h +230 
h +1 h +4 h +17094 m +1 h +73 h +358 h +64 h +278 h +4 h +5 h +1 h +4 h +10 h +4 h +10 h +1 h +10 h +17095 m +10 h +1 h +1454 h +17096 m +36 h +3707 m +2017 m +1 h +4 h +4919 m +4 h +10 h +4 h +172 h +11 h +10 h +196 h +10 h +10 h +10 h +1137 h +1677 m +10 h +157 h +4 h +109 h +4 h +109 h +17097 m +65 h +17098 m +124 h +17099 m +4 h +17100 m +4 h +10 h +4 h +124 h +112 h +17101 m +4 h +4 h +192 h +17102 m +4 h +45 h +403 h +1 h +238 h +4 h +1 h +17103 m +4 h +17104 m +4 h +1074 h +4 h +22 h +185 h +3025 m +10 h +125 h +156 h +1 h +92 h +4 h +10 h +5470 m +1 h +4 h +4 h +170 h +17105 m +83 h +5 h +17106 m +97 h +716 m +1062 m +4 h +59 h +110 h +1642 h +1 h +17107 m +1 h +10 h +17108 m +4 h +4 h +4 h +17109 m +124 h +4 h +4 h +31 h +1 h +282 m +1074 h +4 h +164 h +3 h +31 h +4 h +1 h +4 h +1 h +4 h +112 h +10 h +4 h +10 h +17110 m +125 h +11 h +17111 m +1 h +1 h +10 h +1 h +17112 m +17113 m +10 h +10 h +185 h +195 h +83 h +4 h +10 h +368 h +987 m +10 h +359 h +1710 m +17114 m +73 h +4 h +17115 m +17116 m +4 h +10 h +1 h +83 h +17117 m +17118 m +109 h +10 h +4 h +4 h +17119 m +1 h +338 h +10 h +4 h +12131 m +4 h +17120 m +17121 m +587 m +2041 m +4 h +124 h +11 h +954 m +4 h +79 h +4 h +10 h +9585 m +801 m +1 h +11 h +1 h +10 h +10 h +3 h +10 h +11 h +17122 m +13821 m +17123 m +55 h +13 h +238 h +4 h +1 h +520 h +4 h +1 h +17124 m +116 m +1 h +3 h +1 h +1 h +3 h +4 h +941 m +4 h +4 h +4 h +4 h +10 h +4 h +17125 m +17126 m +918 m +601 h +10 h +4 h +4 h +5967 m +4 h +966 h +17127 m +4 h +4 h +17128 m +1 h +1737 m +1 h +1 h +3562 m +17129 m +1 h +65 h +17130 m +4 h +1 h +104 h +10 h +10 h +17131 m +4 h +11 h +109 h +10 h +17132 m +1 h +4 h +1725 m +443 h +17133 m +425 m +17134 m +332 h +124 h +4 h +10 h +17135 m +17136 m +59 h +31 h +17137 m +10 h +172 h +31 h +10 h +157 h +17138 m +4 h +1137 h +17139 m +4 h +4 h +17140 m +276 h +1 h +4 h +10 h +17141 m +4 h +4 h +10739 m +4 h +4 h +17142 m +146 h +1 h +10 h +59 h +1 h +4 h +4 h +1337 m +4 h +3 h +17143 m +1 h +1 h +7900 
m +258 h +17144 m +74 h +1 h +4 h +266 h +4 h +10 h +4 h +82 h +124 h +17145 m +1 h +1 h +1 h +4 h +55 h +276 h +31 h +17146 m +1 h +10 h +266 h +143 h +10 h +204 h +13 h +59 h +41 h +17147 m +4 h +10 h +4 h +10 h +124 h +119 h +2617 m +1 h +1 h +45 h +17148 m +10 h +195 h +4 h +10 h +10 h +1 h +17149 m +4 h +74 h +295 h +147 h +41 h +1 h +17150 m +1 h +219 h +4 h +4 h +4 h +1 h +25 h +506 m +73 h +10 h +447 h +150 m +4 h +10 h +1 h +17151 m +4 h +25 h +10 h +1 h +10 h +124 h +4 h +36 h +17152 m +1 h +10 h +17153 m +1 h +4 h +17154 m +4 h +1 h +4 h +167 h +4 h +79 h +8486 m +11 h +17155 m +488 m +97 h +1 h +4 h +57 h +13334 m +4 h +3 h +1 h +184 h +1 h +57 h +4 h +41 h +1 h +1 h +17156 m +276 h +114 h +1 h +10 h +1 h +4 h +195 h +17157 m +1 h +10 h +59 h +124 h +307 h +10 h +10 h +10 h +810 m +1 h +10 h +10 h +27 h +1 h +1 h +4 h +1470 h +4 h +12898 m +4 h +274 h +195 h +1 h +36 h +10 h +119 h +27 h +1 h +403 h +10 h +10 h +1 h +59 h +4 h +10 h +16395 m +10 h +4 h +1 h +17158 m +383 h +387 m +4 h +10 h +1 h +4 h +17159 m +1796 m +1 h +17160 m +11 h +10 h +17161 m +10 h +5 h +1 h +1 h +17162 m +274 h +17163 m +113 h +488 m +4 h +3 h +4 h +196 h +147 h +1 h +56 h +17164 m +270 h +31 h +4 h +3 h +1 h +7 m +3 h +124 h +17165 m +4 h +2788 h +403 h +12 h +1 h +1053 m +17166 m +17167 m +433 m +4 h +1 h +97 h +4 h +1 h +74 h +8 h +17168 m +55 h +25 h +4 h +17169 m +10556 m +17170 m +1 h +13 h +10 h +1304 m +104 h +97 h +10 h +10 h +17171 m +10 h +1 h +4 h +10 h +687 h +4 h +4 h +4 h +82 h +17172 m +17173 m +488 h +10 h +1 h +45 h +11 h +3 h +17174 m +1 h +4 h +4 h +1 h +276 h +10 h +4 h +1 h +59 h +17175 m +113 h +10 h +196 h +164 h +195 h +4 h +17176 m +1 h +3601 m +10 h +135 h +10 h +11 h +59 h +17177 m +10 h +17178 m +17179 m +4 h +114 h +17180 m +888 h +10 h +1 h +17181 m +2558 m +4 h +17182 m +10 h +41 h +10 h +10 h +17183 m +1 h +10 h +17184 m +17185 m +12993 m +10 h +114 h +1 h +2438 m +1 h +1 h +14570 m +763 m +10 h +1 h +872 m +1 h +4256 m +1 h +17186 m +17187 m 
+1 h +4 h +1556 m +1 h +17188 m +17189 m +195 h +17190 m +17191 m +10 h +1 h +1542 m +1 h +1 h +265 h +4 h +10 h +10 h +125 h +258 h +17192 m +13 h +1 h +1 h +601 h +10 h +114 h +3 h +935 m +1 h +10 h +124 h +1 h +1 h +17193 m +1 h +12244 m +41 h +10 h +17194 m +41 h +4 h +1 h +4 h +84 m +17195 m +17196 m +1 h +4 h +4 h +1322 m +4 h +4 h +4 h +25 h +10 h +74 h +10 h +1 h +17197 m +4 h +82 h +12911 m +10 h +4 h +17198 m +17199 m +147 h +41 h +10 h +4 h +10 h +17200 m +4 h +10 h +146 h +1 h +4 h +174 m +4 h +57 h +25 h +4 h +17201 m +1 h +10 h +3 h +4 h +4 h +1 h +125 h +3 h +4 h +27 h +104 h +1 h +10 h +10 h +4 h +1 h +124 h +31 h +4 h +74 h +1 h +1 h +204 h +83 h +57 h +17202 m +4747 m +124 h +224 h +4 h +10 h +17203 m +10 h +65 h +10 h +4 h +65 h +4 h +79 h +10 h +4 h +10 h +10 h +578 h +17204 m +17205 m +1 h +1 h +265 h +4 h +11 h +478 m +6505 m +1 h +3276 m +4 h +12 h +17206 m +6304 m +10 h +4 h +4 h +4 h +265 h +4 h +1 h +1 h +769 m +17207 m +170 h +17208 m +10 h +10 h +4 h +1 h +17209 m +17210 m +1 h +10 h +1 h +10 h +1 h +17211 m +10 h +10 h +124 h +10 h +17212 m +17213 m +4 h +1 h +91 h +17214 m +11 h +4 h +17215 m +4 h +1 h +17216 m +10 h +10 h +4 h +1 h +1 h +4 h +25 h +10 h +59 h +1 h +17217 m +17218 m +17219 m +10 h +10 h +59 h +1 h +109 h +10 h +4409 m +4 h +4 h +4 h +10 h +4240 m +1 h +10 h +45 h +4 h +10 h +10 h +124 h +4 h +4 h +17220 m +146 h +17221 m +262 h +4 h +17222 m +1 h +10 h +196 h +135 h +12655 m +4240 m +17223 m +1 h +1 h +4 h +4 h +1 h +97 h +74 h +1 h +295 h +4 h +4 h +1 h +22 h +1 h +55 h +779 h +1 h +1 h +4 h +4 h +73 h +4 h +10 h +1 h +17224 m +17225 m +65 h +10 h +4 h +17226 m +1 h +10 h +1 h +3188 m +17227 m +59 h +4 h +181 h +56 h +17228 m +493 m +566 m +17229 m +17230 m +1 h +256 h +10 h +4 h +10 h +3 h +1 h +1 h +10 h +4 h +36 h +570 m +1 h +10 h +10 h +17231 m +17232 m +10 h +31 h +10 h +97 h +5760 m +4 h +12 h +4 h +4 h +3 h +4 h +1 h +4 h +1403 h +10 h +10 h +1 h +17233 m +640 h +4 h +4 h +17234 m +31 h +802 m +17235 m +1 h 
+25 h +17236 m +1 h +17237 m +1835 m +17238 m +1089 h +1 h +79 h +1 h +124 h +17239 m +4 h +17240 m +4 h +146 h +135 h +319 h +109 h +12 h +295 h +3 h +1 h +4 h +15377 m +11 h +4 h +10 h +10 h +10 h +10 h +1045 m +17241 m +1 h +17242 m +147 h +1 h +31 h +11 h +17243 m +1 h +4 h +1 h +1 h +10 h +10 h +13 h +17244 m +104 h +4 h +10 h +4576 m +4 h +31 h +4 h +124 h +17245 m +687 h +181 h +17246 m +17247 m +196 h +195 h +17248 m +4 h +82 h +25 h +17249 m +17250 m +4 h +4292 m +17251 m +17252 m +11 h +10 h +1 h +13 h +17253 m +10 h +1 h +10 h +1 h +10 h +17254 m +5917 m +601 h +4 h +97 h +4 h +1403 h +17255 m +114 h +82 h +114 h +73 h +17256 m +17257 m +10 h +1 h +7125 m +12 h +10 h +14723 m +10 h +1 h +238 h +108 h +4576 m +4 h +10 h +1955 m +1 h +4 h +4 h +1 h +10 h +10 h +17258 m +109 h +1105 h +17259 m +7938 m +1 h +4 h +11 h +3 h +4 h +36 h +1 h +4 h +17260 m +104 h +17261 m +10 h +4 h +12 h +4 h +1 h +124 h +10 h +82 h +17262 m +17263 m +4 h +41 h +113 h +4 h +10 h +355 h +295 h +17264 m +17265 m +11 h +11 h +17266 m +1 h +412 h +1 h +57 h +11 h +109 h +17267 m +1 h +874 m +195 h +123 h +69 h +4 h +276 h +186 h +4 h +41 h +82 h +1 h +1 h +4 h +1 h +4 h +737 m +172 h +443 h +27 h +8 h +1 h +4 h +17268 m +4 h +1 h +538 h +1 h +1 h +4 h +17269 m +36 h +124 h +4 h +11 h +84 m +195 h +520 h +17270 m +17271 m +371 h +10 h +4 h +17272 m +17273 m +119 h +1 h +17274 m +4 h +258 h +114 h +31 h +4 h +3779 m +1 h +4 h +92 h +17275 m +4 h +5522 m +147 h +1 h +4 h +17276 m +59 h +4 h +1886 h +10 h +4 h +17277 m +4 h +1 h +4229 m +2172 m +11 h +1406 m +17278 m +4 h +17279 m +4 h +4 h +17280 m +10 h +17281 m +1835 m +10 h +83 h +4 h +1 h +10 h +10 h +1 h +1 h +2025 m +843 h +17282 m +1 h +17283 m +3 h +10 h +10 h +4 h +4 h +17284 m +10 h +1 h +17285 m +1 h +1 h +1 h +4 h +4 h +10 h +1 h +4 h +17286 m +4 h +4 h +4 h +4 h +11 h +4 h +1 h +1 h +1 h +4 h +447 h +82 h +4 h +4 h +17287 m +3768 m +17288 m +1 h +4 h +238 h +17289 m +64 h +64 h +10 h +4 h +11 h +1790 m +17290 m +1868 m +4 
h +146 h +17291 m +276 h +125 h +10 h +10 h +17292 m +83 h +190 h +1 h +1 h +4 h +109 h +4 h +17293 m +4 h +1 h +1 h +10 h +10 h +1 h +3 h +11 h +4 h +17294 m +1771 m +266 h +1 h +1 h +4 h +4 h +1 h +31 h +4 h +11 h +17295 m +59 h +10 h +10 h +10 h +1 h +124 h +4 h +10 h +41 h +10 h +6135 m +1 h +1 h +125 h +17296 m +57 h +11 h +11 h +94 h +17297 m +4 h +1 h +10 h +46 m +536 h +82 h +4 h +4 h +1 h +10 h +4 h +4 h +358 h +17298 m +17299 m +4 h +17300 m +17301 m +4 h +1 h +1 h +307 h +82 h +4 h +10 h +17302 m +1 h +109 h +17303 m +17304 m +1 h +1 h +1 h +59 h +1 h +10 h +1 h +4 h +1 h +12 h +2627 m +10900 m +124 h +4 h +4 h +36 h +124 h +1 h +4 h +10 h +82 h +66 m +299 h +1 h +1 h +4 h +1796 m +4 h +167 h +10 h +1337 m +135 h +1 h +1 h +17305 m +94 h +4 h +79 h +1 h +10 h +1 h +4 h +190 h +10 h +112 h +4 h +16338 m +65 h +1 h +10 h +17306 m +17307 m +10 h +4 h +10 h +181 h +1 h +11 h +8511 m +4 h +10 h +10 h +146 h +4 h +143 h +10 h +17308 m +1 h +17309 m +17310 m +4 h +41 h +10 h +83 h +25 h +10 h +17311 m +140 h +17312 m +1 h +1 h +17313 m +1 h +83 h +106 h +276 h +10 h +45 h +1 h +59 h +1 h +3 h +10 h +10 h +17314 m +17315 m +1 h +4 h +41 h +1574 m +1027 h +4 h +1 h +17316 m +10 h +17317 m +10 h +10 h +4 h +4 h +4 h +123 h +17318 m +13 h +2002 h +1 h +17319 m +3 h +4 h +1 h +4 h +109 h +125 h +74 h +17320 m +17321 m +4 h +10 h +10 h +1 h +10 h +258 h +17322 m +4 h +10 h +17323 m +4 h +4 h +17324 m +4 h +82 h +59 h +82 h +4 h +59 h +17325 m +17326 m +124 h +1975 m +17327 m +4 h +1 h +10 h +4 h +94 h +79 h +69 h +1 h +17328 m +4 h +1 h +17329 m +1 h +10 h +1 h +10 h +12 h +48 h +124 h +124 h +17330 m +17331 m +1 h +5537 m +113 h +338 h +4 h +1 h +17332 m +4 h +110 h +10 h +17333 m +359 h +17334 m +1 h +1 h +4 h +4 h +10 h +1 h +4 h +17335 m +4 h +10 h +1 h +1 h +10 h +17336 m +1 h +17337 m +4 h +10 h +4 h +17338 m +10 h +170 h +17339 m +17340 m +17341 m +104 h +17342 m +11 h +1 h +1 h +1 h +4 h +1 h +10 h +124 h +10 h +4 h +295 h +4 h +1 h +11 h +4 h +10 h +4 h +278 
h +59 h +123 h +1 h +4 h +10 h +10 h +278 h +59 h +17343 m +65 h +1 h +224 h +17344 m +17345 m +536 h +4 h +1 h +10 h +124 h +13 h +258 h +4 h +45 h +17346 m +17347 m +4 h +10 h +11 h +94 h +73 h +4 h +56 h +8767 m +10 h +4 h +73 h +1 h +17348 m +4 h +55 h +17349 m +1 h +15790 m +17350 m +10 h +17351 m +1 h +1 h +270 h +17352 m +65 h +10 h +17353 m +10 h +1 h +105 m +4 h +10 h +4 h +1 h +10 h +1725 m +83 h +59 h +1 h +45 h +266 h +124 h +97 h +4 h +1 h +104 h +4 h +17354 m +195 h +17355 m +190 h +4 h +10 h +106 h +888 h +45 h +1 h +4 h +17356 m +4 h +4 h +41 h +1 h +82 h +110 h +266 h +4 h +276 h +17357 m +170 h +11 h +10 h +10 h +41 h +10 h +1 h +4 h +823 m +1 h +4 h +10 h +10 h +135 h +1 h +229 h +119 h +173 h +1 h +4 h +167 h +10 h +10 h +4 h +800 m +4 h +17358 m +17359 m +17360 m +55 h +4 h +65 h +10 h +4 h +1 h +97 h +17361 m +17362 m +172 h +156 h +4 h +1 h +383 h +17363 m +4 h +1 h +4 h +2720 m +4 h +169 h +10 h +82 h +4 h +4 h +1 h +33 m +6132 m +17364 m +3 h +1 h +4 h +4 h +4 h +11 h +17365 m +82 h +17366 m +4 h +1 h +10 h +4 h +4 h +1 h +918 m +1 h +11 h +10 h +17367 m +41 h +1 h +65 h +1 h +1362 m +17368 m +74 h +1249 m +17369 m +1 h +82 h +57 h +4 h +3 h +1 h +4 h +59 h +125 h +10 h +22 h +83 h +1 h +4 h +4350 m +10 h +1 h +2925 m +10 h +17370 m +59 h +4 h +1 h +986 h +82 h +4 h +1309 h +17371 m +275 m +1 h +55 h +17372 m +170 h +1 h +10 h +1 h +17373 m +17374 m +1 h +4 h +1822 h +1 h +4 h +1 h +17375 m +4 h +17376 m +2418 m +57 h +4 h +17377 m +13879 m +10 h +45 h +4 h +17378 m +1772 m +17379 m +17380 m +4 h +4 h +4 h +763 m +3 h +4 h +31 h +125 h +1 h +10 h +265 h +4 h +57 h +10 h +10 h +1 h +1 h +173 h +10 h +4 h +27 h +4 h +17381 m +113 h +1 h +10 h +4 h +25 h +17382 m +989 m +3 h +4 h +10 h +17383 m +17384 m +156 h +3622 m +4 h +4 h +17385 m +12 h +4 h +10365 m +17386 m +169 h +1 h +4 h +94 h +4 h +368 h +109 h +229 h +17387 m +17388 m +10 h +57 h +17389 m +4 h +1 h +82 h +1 h +1 h +17390 m +82 h +1981 m +10 h +1100 m +1 h +105 m +17391 m +1 h 
+17392 m +146 h +1 h +11 h +55 h +10 h +64 h +17393 m +17394 m +11 h +4 h +65 h +266 h +17395 m +4 h +92 h +1 h +358 h +262 h +57 h +1 h +2786 m +4 h +1 h +1 h +10099 m +4 h +4 h +4 h +109 h +4 h +4 h +11 h +17396 m +4 h +112 h +17397 m +4 h +1 h +17398 m +82 h +17399 m +17400 m +1 h +4 h +10 h +17401 m +4 h +17402 m +332 h +1 h +1 h +10 h +17403 m +1 h +1 h +986 h +73 h +4 h +17404 m +4 h +125 h +4 h +3 h +73 h +4 h +10 h +1 h +12 h +1 h +59 h +1 h +4 h +17405 m +493 m +4 h +4 h +10 h +73 h +17406 m +17407 m +82 h +17408 m +10937 m +10 h +1 h +10 h +10 h +196 h +7535 m +12 h +4 h +1 h +10 h +4 h +1 h +1 h +17409 m +82 h +1 h +17410 m +124 h +17411 m +1 h +4 h +1 h +10 h +1 h +10 h +3089 m +1 h +11 h +1 h +1 h +1 h +4 h +57 h +1 h +4 h +4 h +4 h +17412 m +1 h +1 h +17413 m +17414 m +94 h +17415 m +4 h +4 h +4 h +40 h +17416 m +265 h +4 h +73 h +601 h +1 h +4145 m +10 h +2887 h +1 h +4 h +3 h +17417 m +10 h +4 h +986 h +10 h +10 h +10 h +464 h +4 h +10 h +17418 m +4 h +17419 m +10 h +123 h +17420 m +17421 m +10 h +17422 m +11 h +17423 m +4 h +4 h +91 h +4 h +4 h +4 h +1 h +10 h +10 h +82 h +17424 m +13 h +144 h +4 h +1 h +17425 m +93 m +12 h +1 h +1 h +338 h +278 h +4 h +190 h +4 h +295 h +642 h +4 h +17426 m +119 h +4 h +10 h +17427 m +1 h +17428 m +10 h +17429 m +59 h +17430 m +1 h +82 h +1 h +4 h +1 h +25 h +4 h +4 h +17431 m +4 h +10 h +17432 m +10 h +4 h +17433 m +45 h +10 h +17434 m +4 h +1 h +17435 m +1 h +17436 m +17437 m +1 h +17438 m +258 h +1 h +1 h +319 h +4 h +4 h +10 h +1 h +17439 m +17440 m +4 h +1 h +17441 m +4 h +10 h +146 h +17442 m +82 h +10 h +4 h +157 h +4 h +25 h +11 h +4 h +10 h +11 h +4 h +692 h +36 h +17443 m +4 h +4 h +1 h +17444 m +1 h +4 h +125 h +1 h +10 h +1337 m +4 h +1 h +4 h +10 h +1 h +1 h +10 h +124 h +10 h +2379 h +4 h +1389 m +4 h +10 h +28 h +17445 m +59 h +1 h +3161 m +17446 m +4 h +4 h +124 h +10 h +83 h +4 h +11 h +82 h +11 h +17447 m +135 h +316 h +196 h +10 h +4 h +1 h +825 m +4 h +74 h +17448 m +10 h +10 h +59 h +447 h +10 
h +17449 m +13811 m +45 h +17450 m +4 h +17451 m +17452 m +1 h +4 h +73 h +1 h +184 h +173 h +4 h +276 h +17453 m +1 h +4 h +1 h +900 m +4 h +4 h +4 h +4 h +10 h +1 h +4 h +31 h +4297 m +13 h +270 h +170 h +17454 m +1 h +4 h +4 h +4 h +17455 m +1 h +16871 m +4 h +1 h +698 m +4 h +59 h +79 h +17456 m +4 h +147 h +11 h +358 h +10 h +147 h +109 h +1 h +172 h +10 h +10 h +1 h +4932 m +11 h +1 h +17457 m +2558 m +229 h +238 h +583 m +25 h +195 h +27 h +1 h +17458 m +4 h +1 h +1 h +4 h +10 h +1 h +10 h +4 h +4 h +8318 m +4 h +109 h +4 h +17459 m +4 h +1 h +10 h +4 h +10 h +57 h +7553 m +10 h +94 h +1 h +17460 m +1 h +1185 m +4 h +27 h +11 h +4 h +91 h +4 h +41 h +10 h +1 h +10 h +4 h +17461 m +295 h +181 h +17462 m +4 h +124 h +1 h +17463 m +17464 m +4 h +1 h +285 m +4 h +524 m +195 h +4384 m +4 h +17465 m +1 h +4 h +4 h +4 h +17466 m +17467 m +262 h +17468 m +4 h +17469 m +64 h +195 h +109 h +17470 m +8 h +4 h +4 h +447 h +4 h +59 h +1 h +4 h +1772 m +83 h +10 h +4 h +10 h +1 h +4 h +2205 m +17471 m +13 h +31 h +77 h +1 h +4 h +17472 m +4 h +1 h +1 h +4 h +4 h +59 h +1 h +4 h +1 h +31 h +16512 m +17473 m +1 h +1 h +2865 m +1 h +10 h +4 h +718 h +10 h +1 h +4 h +4 h +17474 m +4359 m +64 h +698 m +45 h +17475 m +4 h +17476 m +1 h +322 h +4 h +36 h +56 h +4 h +307 h +17477 m +59 h +10 h +147 h +1 h +1 h +125 h +5162 m +10 h +10 h +1 h +4 h +4 h +10 h +4 h +4 h +12 h +4 h +1 h +4 h +17478 m +10 h +1 h +17479 m +4 h +17480 m +10 h +11654 m +135 h +4 h +4 h +4 h +1 h +4 h +4 h +4 h +17481 m +10 h +4 h +45 h +17482 m +3 h +1 h +203 h +1 h +1 h +4 h +1 h +1 h +17483 m +4 h +11 h +4 h +10 h +10 h +1 h +10 h +4 h +124 h +11 h +17484 m +4 h +353 h +1 h +55 h +195 h +64 h +1322 m +124 h +4 h +4 h +125 h +17485 m +274 h +1 h +17486 m +3 h +4 h +4 h +1 h +10 h +57 h +10 h +238 h +4 h +6371 m +1 h +11 h +17487 m +17488 m +1556 m +196 h +1 h +316 h +4 h +17489 m +4 h +169 h +17490 m +25 h +17491 m +4 h +1 h +17492 m +10 h +327 m +1 h +383 h +17493 m +1 h +270 h +4 h +17494 m +737 m +10 
h +124 h +4 h +109 h +359 h +10 h +143 h +17495 m +12956 m +10 h +3159 m +1 h +17496 m +649 m +4 h +4 h +109 h +4538 m +17497 m +59 h +1 h +4 h +4 h +4 h +1 h +1 h +7243 m +4 h +1 h +82 h +1 h +1 h +1 h +3188 m +17498 m +17499 m +4 h +1 h +4 h +10 h +4 h +4 h +10 h +4 h +17500 m +41 h +4 h +1667 m +1 h +444 m +10 h +4 h +17501 m +17502 m +10 h +4 h +125 h +4 h +3 h +17503 m +1 h +1 h +1 h +17504 m +124 h +4349 m +17505 m +4 h +17506 m +8 h +17507 m +55 h +57 h +1665 m +185 h +10 h +17508 m +1 h +258 h +4 h +1 h +57 h +2733 h +104 h +4 h +170 h +10 h +4 h +1 h +1 h +147 h +1 h +124 h +11 h +17509 m +4 h +17510 m +10 h +10 h +17511 m +25 h +258 h +17512 m +79 h +97 h +3908 m +1 h +1 h +40 h +10 h +10 h +1 h +118 h +2022 m +1 h +27 h +1 h +4 h +1 h +146 h +10 h +17513 m +295 h +17514 m +1 h +4 h +1 h +1 h +4 h +1 h +1 h +17515 m +124 h +2438 m +17516 m +230 h +25 h +17517 m +1 h +1 h +17518 m +25 h +10 h +17519 m +570 m +10 h +4 h +390 m +4 h +11 h +1 h +4 h +4 h +1 h +59 h +4 h +4 h +17520 m +4 h +109 h +4 h +17521 m +4 h +17522 m +10 h +4 h +146 h +935 m +17523 m +1 h +1 h +10 h +1772 h +17524 m +4 h +1 h +4 h +11 h +104 h +64 h +83 h +36 h +1 h +74 h +82 h +4 h +265 h +109 h +338 h +784 m +4 h +5387 m +156 h +4 h +17525 m +4 h +1 h +17526 m +601 h +15455 m +4 h +4 h +17527 m +10 h +4 h +1 h +170 h +8 h +4 h +57 h +1 h +17528 m +17529 m +1619 h +4 h +4 h +17530 m +3 h +185 h +17531 m +12192 m +17532 m +114 h +4 h +125 h +192 h +4 h +1 h +73 h +69 h +82 h +1 h +4 h +11 h +10 h +10 h +11 h +10 h +4 h +4 h +4 h +57 h +1 h +1 h +17533 m +17534 m +17535 m +9256 m +4 h +17536 m +1030 m +167 h +83 h +1 h +1 h +1 h +1 h +56 h +4 h +59 h +4 h +1 h +4 h +10 h +1 h +11 h +1372 m +10 h +124 h +112 h +1 h +4 h +4 h +266 h +1 h +4 h +79 h +181 h +73 h +4 h +17537 m +17538 m +124 h +10 h +4 h +17539 m +1 h +4 h +17540 m +332 h +10383 m +266 h +1 h +1 h +1 h +4 h +4 h +1 h +447 h +4 h +1 h +17541 m +4 h +109 h +17542 m +17543 m +110 h +4 h +4 h +4 h +97 h +10 h +10 h +13 h +10 h +4 
h +1 h +59 h +79 h +27 h +10 h +17544 m +10 h +17545 m +258 h +4 h +2607 m +57 h +1 h +157 h +1 h +4 h +10 h +4 h +185 h +10 h +17546 m +4 h +17547 m +17548 m +10 h +17549 m +1 h +17550 m +17551 m +11 h +1 h +4 h +17552 m +1403 h +4 h +17553 m +10 h +4 h +17554 m +307 h +1 h +99 m +4 h +10 h +104 h +164 h +14814 m +119 h +9912 m +4 h +10 h +196 h +25 h +479 h +1 h +536 h +1 h +1 h +4 h +1 h +17555 m +11 h +885 m +97 h +109 h +4 h +4 h +1 h +139 h +4 h +10 h +687 h +12 h +97 h +10 h +4 h +45 h +1 h +1 h +8711 m +4 h +4 h +33 m +17556 m +82 h +4 h +10 h +83 h +4 h +1 h +169 h +10 h +73 h +4 h +11334 m +10 h +10 h +1 h +82 h +1 h +57 h +278 h +17557 m +4 h +5141 m +1 h +17558 m +17559 m +17560 m +10 h +1 h +17561 m +16841 m +630 m +2617 m +9397 m +65 h +83 h +4 h +4 h +4 h +79 h +4 h +4 h +196 h +1 h +17562 m +4 h +4 h +4 h +1691 m +17563 m +110 h +172 h +125 h +83 h +55 h +13 h +10 h +83 h +1 h +195 h +1 h +17564 m +10 h +8503 m +17565 m +3 h +10 h +10 h +10 h +1 h +295 h +1 h +170 h +4 h +4 h +5505 m +1 h +4 h +4 h +17566 m +17567 m +124 h +10 h +1027 h +1 h +17568 m +104 h +10 h +468 h +4 h +2733 h +124 h +4867 m +4 h +2172 m +4 h +4 h +1 h +4 h +8 h +17569 m +146 h +4441 h +41 h +17570 m +4 h +4 h +4 h +1 h +11 h +83 h +4 h +17571 m +4 h +17572 m +10 h +17573 m +10 h +22 h +1 h +10 h +1 h +17574 m +97 h +4350 m +14316 m +794 m +4 h +12020 m +4 h +17575 m +17576 m +10 h +4 h +4 h +17577 m +10 h +17578 m +4 h +230 h +4 h +4 h +109 h +7800 m +17579 m +4 h +10 h +143 h +4 h +1 h +10 h +31 h +17580 m +4 h +4 h +1 h +4 h +10 h +4 h +5348 m +77 h +4 h +1 h +186 h +119 h +4 h +4 h +17581 m +4 h +17582 m +10 h +17583 m +65 h +331 m +4 h +48 h +17584 m +125 h +185 h +170 h +17585 m +17586 m +17587 m +467 m +10 h +1 h +278 h +1074 h +25 h +17588 m +17589 m +17590 m +48 h +17591 m +74 h +464 h +94 h +4 h +1 h +4 h +2532 m +57 h +4 h +4 h +1 h +17592 m +4 h +97 h +17593 m +125 h +1027 h +17594 m +3 h +17595 m +1 h +10 h +692 h +238 h +156 h +11 h +10 h +10 h +4 h +1 h +1 h +1 
h +172 h +1 h +109 h +17596 m +4 h +17597 m +1 h +57 h +13790 m +1768 m +4 h +17598 m +4 h +8741 m +17599 m +73 h +1 h +4 h +1 h +167 h +17600 m +185 h +1 h +1 h +1 h +17601 m +10 h +1 h +1 h +10 h +1 h +4 h +1 h +173 h +4 h +4 h +55 h +4 h +1619 h +10 h +1 h +278 h +195 h +1 h +11 h +59 h +10 h +4 h +4904 m +1 h +48 h +4 h +10 h +4 h +17602 m +4 h +59 h +1 h +8 h +4 h +1 h +11 h +4 h +110 h +4 h +4 h +1 h +1045 m +17603 m +97 h +1 h +8 h +1 h +3657 m +1 h +367 m +17604 m +1 h +1 h +2585 m +156 h +4 h +64 h +4 h +17605 m +4 h +10 h +4 h +17606 m +17607 m +17608 m +10 h +4 h +17609 m +4 h +25 h +4 h +332 h +4 h +4 h +12 h +17610 m +3 h +4 h +10 h +17611 m +57 h +1791 m +17612 m +4 h +8 h +1 h +4 h +1 h +10 h +17613 m +1 h +4 h +4 h +1 h +10 h +17614 m +146 h +4 h +1 h +1 h +4 h +4 h +28 h +4 h +57 h +4 h +307 h +112 h +11 h +97 h +11 h +1 h +10 h +22 h +10 h +3 h +17615 m +119 h +806 m +10 h +17616 m +135 h +4 h +79 h +4 h +1261 h +11 h +1 h +4 h +157 h +4 h +82 h +3 h +59 h +45 h +97 h +17617 m +17618 m +1 h +31 h +65 h +11 h +41 h +266 h +1 h +4 h +4 h +4 h +17619 m +10 h +4 h +229 h +1 h +1 h +10 h +1 h +276 h +10 h +27 h +10 h +17620 m +4 h +383 h +59 h +17621 m +581 m +4 h +4 h +1250 h +41 h +92 h +17622 m +990 m +10 h +1 h +17623 m +4 h +1 h +4 h +279 h +601 h +4 h +1 h +4 h +57 h +10 h +1 h +4 h +4 h +6851 m +4 h +157 h +135 h +10 h +1 h +1 h +4 h +1 h +4 h +258 h +14708 m +3 h +17624 m +1 h +146 h +55 h +262 h +158 h +1 h +4 h +1 h +1 h +13007 m +124 h +169 h +17625 m +170 h +4 h +17626 m +911 h +4 h +156 h +4 h +1 h +143 h +17627 m +4 h +41 h +10 h +4 h +17628 m +4 h +56 h +4 h +125 h +11 h +124 h +4292 m +17629 m +97 h +124 h +2733 h +36 h +4 h +17630 m +1 h +1260 m +4 h +10 h +17631 m diff --git a/pebble/internal/cache/value.go b/pebble/internal/cache/value.go new file mode 100644 index 0000000..6d2cae1 --- /dev/null +++ b/pebble/internal/cache/value.go @@ -0,0 +1,46 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package cache + +// Value holds a reference counted immutable value. +type Value struct { + buf []byte + // Reference count for the value. The value is freed when the reference count + // drops to zero. + ref refcnt +} + +// Buf returns the buffer associated with the value. The contents of the buffer +// should not be changed once the value has been added to the cache. Instead, a +// new Value should be created and added to the cache to replace the existing +// value. +func (v *Value) Buf() []byte { + if v == nil { + return nil + } + return v.buf +} + +// Truncate the buffer to the specified length. The buffer length should not be +// changed once the value has been added to the cache as there may be +// concurrent readers of the Value. Instead, a new Value should be created and +// added to the cache to replace the existing value. +func (v *Value) Truncate(n int) { + v.buf = v.buf[:n] +} + +func (v *Value) refs() int32 { + return v.ref.refs() +} + +func (v *Value) acquire() { + v.ref.acquire() +} + +func (v *Value) release() { + if v != nil && v.ref.release() { + v.free() + } +} diff --git a/pebble/internal/cache/value_invariants.go b/pebble/internal/cache/value_invariants.go new file mode 100644 index 0000000..1e30d27 --- /dev/null +++ b/pebble/internal/cache/value_invariants.go @@ -0,0 +1,55 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build (invariants && !race) || (tracing && !race) +// +build invariants,!race tracing,!race + +package cache + +import ( + "fmt" + "os" + + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/manual" +) + +// newValue creates a Value with a manually managed buffer of size n. 
+// +// This definition of newValue is used when either the "invariants" or +// "tracing" build tags are specified. It hooks up a finalizer to the returned +// Value that checks for memory leaks when the GC determines the Value is no +// longer reachable. +func newValue(n int) *Value { + if n == 0 { + return nil + } + b := manual.New(n) + v := &Value{buf: b} + v.ref.init(1) + // Note: this is a no-op if invariants and tracing are disabled or race is + // enabled. + invariants.SetFinalizer(v, func(obj interface{}) { + v := obj.(*Value) + if v.buf != nil { + fmt.Fprintf(os.Stderr, "%p: cache value was not freed: refs=%d\n%s", + v, v.refs(), v.ref.traces()) + os.Exit(1) + } + }) + return v +} + +func (v *Value) free() { + // When "invariants" are enabled set the value contents to 0xff in order to + // cache use-after-free bugs. + for i := range v.buf { + v.buf[i] = 0xff + } + manual.Free(v.buf) + // Setting Value.buf to nil is needed for correctness of the leak checking + // that is performed when the "invariants" or "tracing" build tags are + // enabled. + v.buf = nil +} diff --git a/pebble/internal/cache/value_normal.go b/pebble/internal/cache/value_normal.go new file mode 100644 index 0000000..e03379d --- /dev/null +++ b/pebble/internal/cache/value_normal.go @@ -0,0 +1,57 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build (!invariants && !tracing) || race +// +build !invariants,!tracing race + +package cache + +import ( + "unsafe" + + "github.com/cockroachdb/pebble/internal/manual" +) + +const valueSize = int(unsafe.Sizeof(Value{})) + +func newValue(n int) *Value { + if n == 0 { + return nil + } + + if !cgoEnabled { + // If Cgo is disabled then all memory is allocated from the Go heap and we + // can't play the trick below to combine the Value and buffer allocation. 
+ v := &Value{buf: make([]byte, n)} + v.ref.init(1) + return v + } + + // When we're not performing leak detection, the lifetime of the returned + // Value is exactly the lifetime of the backing buffer and we can manually + // allocate both. + // + // TODO(peter): It may be better to separate the allocation of the value and + // the buffer in order to reduce internal fragmentation in malloc. If the + // buffer is right at a power of 2, adding valueSize might push the + // allocation over into the next larger size. + b := manual.New(valueSize + n) + v := (*Value)(unsafe.Pointer(&b[0])) + v.buf = b[valueSize:] + v.ref.init(1) + return v +} + +func (v *Value) free() { + if !cgoEnabled { + return + } + + // When we're not performing leak detection, the Value and buffer were + // allocated contiguously. + n := valueSize + cap(v.buf) + buf := (*[manual.MaxArrayLen]byte)(unsafe.Pointer(v))[:n:n] + v.buf = nil + manual.Free(buf) +} diff --git a/pebble/internal/constants/constants.go b/pebble/internal/constants/constants.go new file mode 100644 index 0000000..8d9198c --- /dev/null +++ b/pebble/internal/constants/constants.go @@ -0,0 +1,17 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package constants + +const ( + // oneIf64Bit is 1 on 64-bit platforms and 0 on 32-bit platforms. + oneIf64Bit = ^uint(0) >> 63 + + // MaxUint32OrInt returns min(MaxUint32, MaxInt), i.e + // - MaxUint32 on 64-bit platforms; + // - MaxInt on 32-bit platforms. + // It is used when slices are limited to Uint32 on 64-bit platforms (the + // length limit for slices is naturally MaxInt on 32-bit platforms). 
+ MaxUint32OrInt = (1<<31)<<oneIf64Bit - 1 +) [...] + return uint32(c>>15|c<<17) + 0xa282ead8 +} diff --git a/pebble/internal/datatest/datatest.go b/pebble/internal/datatest/datatest.go new file mode 100644 index 0000000..40f78d5 --- /dev/null +++ b/pebble/internal/datatest/datatest.go @@ -0,0 +1,140 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package datatest provides common datadriven test commands for use outside of +// the root Pebble package. +package datatest + +import ( + "strings" + "sync" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble" +) + +// TODO(jackson): Consider a refactoring that can consolidate this package and +// the datadriven commands defined in pebble/data_test.go. + +// DefineBatch interprets the provided datadriven command as a sequence of write +// operations, one-per-line, to apply to the provided batch.
+func DefineBatch(d *datadriven.TestData, b *pebble.Batch) error { + for _, line := range strings.Split(d.Input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + if parts[1] == `<nil>` { + parts[1] = "" + } + var err error + switch parts[0] { + case "set": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + err = b.Set([]byte(parts[1]), []byte(parts[2]), nil) + case "del": + if len(parts) != 2 { + return errors.Errorf("%s expects 1 argument", parts[0]) + } + err = b.Delete([]byte(parts[1]), nil) + case "singledel": + if len(parts) != 2 { + return errors.Errorf("%s expects 1 argument", parts[0]) + } + err = b.SingleDelete([]byte(parts[1]), nil) + case "del-range": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + err = b.DeleteRange([]byte(parts[1]), []byte(parts[2]), nil) + case "merge": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + err = b.Merge([]byte(parts[1]), []byte(parts[2]), nil) + case "range-key-set": + if len(parts) != 5 { + return errors.Errorf("%s expects 4 arguments", parts[0]) + } + err = b.RangeKeySet( + []byte(parts[1]), + []byte(parts[2]), + []byte(parts[3]), + []byte(parts[4]), + nil) + case "range-key-unset": + if len(parts) != 4 { + return errors.Errorf("%s expects 3 arguments", parts[0]) + } + err = b.RangeKeyUnset( + []byte(parts[1]), + []byte(parts[2]), + []byte(parts[3]), + nil) + case "range-key-del": + if len(parts) != 3 { + return errors.Errorf("%s expects 2 arguments", parts[0]) + } + err = b.RangeKeyDelete( + []byte(parts[1]), + []byte(parts[2]), + nil) + default: + return errors.Errorf("unknown op: %s", parts[0]) + } + if err != nil { + return err + } + } + return nil +} + +// CompactionTracker is a listener that tracks the number of compactions.
+type CompactionTracker struct { + sync.Cond + count int + attached bool +} + +// NewCompactionTracker setups the necessary options to keep track of the +// compactions that are in flight. +func NewCompactionTracker(options *pebble.Options) *CompactionTracker { + ct := CompactionTracker{} + ct.Cond = sync.Cond{ + L: &sync.Mutex{}, + } + ct.attached = true + el := pebble.EventListener{ + CompactionEnd: func(info pebble.CompactionInfo) { + ct.L.Lock() + ct.count-- + ct.Broadcast() + ct.L.Unlock() + }, + CompactionBegin: func(info pebble.CompactionInfo) { + ct.L.Lock() + ct.count++ + ct.Broadcast() + ct.L.Unlock() + }, + } + + options.AddEventListener(el) + return &ct +} + +// WaitForInflightCompactionsToEqual waits until compactions meet the specified target. +func (cql *CompactionTracker) WaitForInflightCompactionsToEqual(target int) { + cql.L.Lock() + if !cql.attached { + panic("Cannot wait for compactions if listener has not been attached") + } + for cql.count != target { + cql.Wait() + } + cql.L.Unlock() +} diff --git a/pebble/internal/dsl/dsl.go b/pebble/internal/dsl/dsl.go new file mode 100644 index 0000000..ef546fd --- /dev/null +++ b/pebble/internal/dsl/dsl.go @@ -0,0 +1,160 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package dsl provides facilities for parsing lisp-like domain-specific +// languages (DSL). +package dsl + +import ( + "fmt" + "go/scanner" + "go/token" + "strconv" + "strings" + + "github.com/cockroachdb/errors" +) + +// NewParser constructs a new Parser of a lisp-like DSL. +func NewParser[T any]() *Parser[T] { + p := new(Parser[T]) + p.constants = make(map[string]func() T) + p.funcs = make(map[string]func(*Parser[T], *Scanner) T) + return p +} + +// NewPredicateParser constructs a new Parser of a Lisp-like DSL, where the +// resulting type implements Predicate[E]. 
NewPredicateParser predefines a few +// useful functions: Not, And, Or, OnIndex. +func NewPredicateParser[E any]() *Parser[Predicate[E]] { + p := NewParser[Predicate[E]]() + p.DefineFunc("Not", parseNot[E]) + p.DefineFunc("And", parseAnd[E]) + p.DefineFunc("Or", parseOr[E]) + p.DefineFunc("OnIndex", parseOnIndex[E]) + return p +} + +// A Parser holds the rules and logic for parsing a DSL. +type Parser[T any] struct { + constants map[string]func() T + funcs map[string]func(*Parser[T], *Scanner) T +} + +// DefineConstant adds a new constant to the Parser's supported DSL. Whenever +// the provided identifier is used within a constant context, the provided +// closure is invoked to instantiate an appropriate AST value. +func (p *Parser[T]) DefineConstant(identifier string, instantiate func() T) { + p.constants[identifier] = instantiate +} + +// DefineFunc adds a new func to the Parser's supported DSL. Whenever the +// provided identifier is used within a function invocation context, the +// provided closure is invoked to instantiate an appropriate AST value. +func (p *Parser[T]) DefineFunc(identifier string, parseFunc func(*Parser[T], *Scanner) T) { + p.funcs[identifier] = parseFunc +} + +// Parse parses the provided input string. +func (p *Parser[T]) Parse(d string) (ret T, err error) { + defer func() { + if r := recover(); r != nil { + var ok bool + err, ok = r.(error) + if !ok { + panic(r) + } + } + }() + + fset := token.NewFileSet() + file := fset.AddFile("", -1, len(d)) + var s Scanner + s.Init(file, []byte(strings.TrimSpace(d)), nil /* no error handler */, 0) + tok := s.Scan() + ret = p.ParseFromPos(&s, tok) + tok = s.Scan() + if tok.Kind == token.SEMICOLON { + tok = s.Scan() + } + assertTok(tok, token.EOF) + return ret, err +} + +// ParseFromPos parses from the provided current position and associated +// scanner. If the parser fails to parse, it panics. This function is intended +// to be used when composing Parsers of various types. 
+func (p *Parser[T]) ParseFromPos(s *Scanner, tok Token) T { + switch tok.Kind { + case token.IDENT: + // A constant without any parens, eg. `Reads`. + p, ok := p.constants[tok.Lit] + if !ok { + panic(errors.Errorf("dsl: unknown constant %q", tok.Lit)) + } + return p() + case token.LPAREN: + // Otherwise it's an expression, eg: (OnIndex 1) + tok = s.Consume(token.IDENT) + fp, ok := p.funcs[tok.Lit] + if !ok { + panic(errors.Errorf("dsl: unknown func %q", tok.Lit)) + } + return fp(p, s) + default: + panic(errors.Errorf("dsl: unexpected token %s; expected IDENT or LPAREN", tok.String())) + } +} + +// A Scanner holds the scanner's internal state while processing a given text. +type Scanner struct { + scanner.Scanner +} + +// Scan scans the next token and returns it. +func (s *Scanner) Scan() Token { + pos, tok, lit := s.Scanner.Scan() + return Token{pos, tok, lit} +} + +// Consume scans the next token. If the token is not of the provided token, it +// panics. It returns the token itself. +func (s *Scanner) Consume(expect token.Token) Token { + t := s.Scan() + assertTok(t, expect) + return t +} + +// ConsumeString scans the next token. It panics if the next token is not a +// string, or if unable to unquote the string. It returns the unquoted string +// contents. +func (s *Scanner) ConsumeString() string { + lit := s.Consume(token.STRING).Lit + str, err := strconv.Unquote(lit) + if err != nil { + panic(errors.Newf("dsl: unquoting %q: %v", lit, err)) + } + return str +} + +// Token is a lexical token scanned from an input text. +type Token struct { + pos token.Pos + Kind token.Token + Lit string +} + +// String implements fmt.Stringer. 
+func (t *Token) String() string { + if t.Lit != "" { + return fmt.Sprintf("(%s, %q) at pos %v", t.Kind, t.Lit, t.pos) + } + return fmt.Sprintf("%s at pos %v", t.Kind, t.pos) +} + +func assertTok(tok Token, expect token.Token) { + if tok.Kind != expect { + panic(errors.Errorf("dsl: unexpected token %s; expected %s", tok.String(), expect)) + } +} diff --git a/pebble/internal/dsl/predicates.go b/pebble/internal/dsl/predicates.go new file mode 100644 index 0000000..fff0fcd --- /dev/null +++ b/pebble/internal/dsl/predicates.go @@ -0,0 +1,136 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package dsl + +import ( + "fmt" + "go/token" + "strconv" + "strings" + "sync/atomic" + + "github.com/cockroachdb/errors" +) + +// Predicate encodes conditional logic that yields a boolean. +type Predicate[E any] interface { + Evaluate(E) bool + String() string +} + +// Not returns a Predicate that negates the provided predicate. +func Not[E any](p Predicate[E]) Predicate[E] { return not[E]{Predicate: p} } + +// And returns a Predicate that evaluates to true if all its operands evaluate +// to true. +func And[E any](preds ...Predicate[E]) Predicate[E] { return and[E](preds) } + +// Or returns a Predicate that evaluates to true if any of its operands evaluate +// true. +func Or[E any](preds ...Predicate[E]) Predicate[E] { return or[E](preds) } + +// OnIndex returns a Predicate that evaluates to true on its N-th call. +func OnIndex[E any](n int32) *Index[E] { + p := new(Index[E]) + p.Int32.Store(n) + return p +} + +// Index is a Predicate that evaluates to true only on its N-th invocation. +type Index[E any] struct { + atomic.Int32 +} + +// String implements fmt.Stringer. +func (p *Index[E]) String() string { + return fmt.Sprintf("(OnIndex %d)", p.Int32.Load()) +} + +// Evaluate implements Predicate. 
+func (p *Index[E]) Evaluate(E) bool { return p.Int32.Add(-1) == -1 } + +type not[E any] struct { + Predicate[E] +} + +func (p not[E]) String() string { return fmt.Sprintf("(Not %s)", p.Predicate.String()) } +func (p not[E]) Evaluate(e E) bool { return !p.Predicate.Evaluate(e) } + +type and[E any] []Predicate[E] + +func (p and[E]) String() string { + var sb strings.Builder + sb.WriteString("(And") + for i := 0; i < len(p); i++ { + sb.WriteRune(' ') + sb.WriteString(p[i].String()) + } + sb.WriteRune(')') + return sb.String() +} + +func (p and[E]) Evaluate(e E) bool { + ok := true + for i := range p { + ok = ok && p[i].Evaluate(e) + } + return ok +} + +type or[E any] []Predicate[E] + +func (p or[E]) String() string { + var sb strings.Builder + sb.WriteString("(Or") + for i := 0; i < len(p); i++ { + sb.WriteRune(' ') + sb.WriteString(p[i].String()) + } + sb.WriteRune(')') + return sb.String() +} + +func (p or[E]) Evaluate(e E) bool { + ok := false + for i := range p { + ok = ok || p[i].Evaluate(e) + } + return ok +} + +func parseNot[E any](p *Parser[Predicate[E]], s *Scanner) Predicate[E] { + preds := parseVariadicPredicate(p, s) + if len(preds) != 1 { + panic(errors.Newf("dsl: not accepts exactly 1 argument, given %d", len(preds))) + } + return not[E]{Predicate: preds[0]} +} + +func parseAnd[E any](p *Parser[Predicate[E]], s *Scanner) Predicate[E] { + return And[E](parseVariadicPredicate[E](p, s)...) +} + +func parseOr[E any](p *Parser[Predicate[E]], s *Scanner) Predicate[E] { + return Or[E](parseVariadicPredicate[E](p, s)...) 
+} + +func parseOnIndex[E any](p *Parser[Predicate[E]], s *Scanner) Predicate[E] { + i, err := strconv.ParseInt(s.Consume(token.INT).Lit, 10, 32) + if err != nil { + panic(err) + } + s.Consume(token.RPAREN) + return OnIndex[E](int32(i)) +} + +func parseVariadicPredicate[E any](p *Parser[Predicate[E]], s *Scanner) (ret []Predicate[E]) { + tok := s.Scan() + for tok.Kind == token.LPAREN || tok.Kind == token.IDENT { + ret = append(ret, p.ParseFromPos(s, tok)) + tok = s.Scan() + } + assertTok(tok, token.RPAREN) + return ret +} diff --git a/pebble/internal/fastrand/fastrand.go b/pebble/internal/fastrand/fastrand.go new file mode 100644 index 0000000..dd3ec9c --- /dev/null +++ b/pebble/internal/fastrand/fastrand.go @@ -0,0 +1,17 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package fastrand + +import _ "unsafe" // required by go:linkname + +// Uint32 returns a lock free uint32 value. +// +//go:linkname Uint32 runtime.fastrand +func Uint32() uint32 + +// Uint32n returns a lock free uint32 value in the interval [0, n). +// +//go:linkname Uint32n runtime.fastrandn +func Uint32n(n uint32) uint32 diff --git a/pebble/internal/fastrand/fastrand_test.go b/pebble/internal/fastrand/fastrand_test.go new file mode 100644 index 0000000..581c056 --- /dev/null +++ b/pebble/internal/fastrand/fastrand_test.go @@ -0,0 +1,86 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package fastrand + +import ( + "fmt" + "sync" + "testing" + "time" + + "golang.org/x/exp/rand" +) + +type defaultRand struct { + mu sync.Mutex + src rand.PCGSource +} + +func newDefaultRand() *defaultRand { + r := &defaultRand{} + r.src.Seed(uint64(time.Now().UnixNano())) + return r +} + +func (r *defaultRand) Uint32() uint32 { + r.mu.Lock() + i := uint32(r.src.Uint64()) + r.mu.Unlock() + return i +} + +func BenchmarkFastRand(b *testing.B) { + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + Uint32() + } + }) +} + +func BenchmarkDefaultRand(b *testing.B) { + r := newDefaultRand() + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + r.Uint32() + } + }) +} + +// Benchmarks for single-threaded (ST) use of fastrand compared to +// constructing a Rand, which can have heap allocation overhead. + +// Global state to disable elision of benchmark code. +var xg uint32 + +func BenchmarkSTFastRand(b *testing.B) { + var x uint32 + for i := 0; i < b.N; i++ { + // Arbitrary constant. + x = Uint32n(2097152) + } + xg = x +} + +func BenchmarkSTDefaultRand(b *testing.B) { + for _, newPeriod := range []int{0, 10, 100, 1000} { + name := "no-new" + if newPeriod > 0 { + name = fmt.Sprintf("new-period=%d", newPeriod) + } + b.Run(name, func(b *testing.B) { + r := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + b.ResetTimer() + var x uint32 + for i := 0; i < b.N; i++ { + if newPeriod > 0 && i%newPeriod == 0 { + r = rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) + } + // Arbitrary constant. + x = uint32(r.Uint64n(2097152)) + } + xg = x + }) + } +} diff --git a/pebble/internal/humanize/humanize.go b/pebble/internal/humanize/humanize.go new file mode 100644 index 0000000..cb82343 --- /dev/null +++ b/pebble/internal/humanize/humanize.go @@ -0,0 +1,68 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package humanize + +import ( + "fmt" + "math" + + "github.com/cockroachdb/redact" +) + +func logn(n, b float64) float64 { + return math.Log(n) / math.Log(b) +} + +func humanate(s uint64, base float64, suffixes []string) string { + if s < 10 { + return fmt.Sprintf("%d%s", s, suffixes[0]) + } + e := math.Floor(logn(float64(s), base)) + suffix := suffixes[int(e)] + val := math.Floor(float64(s)/math.Pow(base, e)*10+0.5) / 10 + f := "%.0f%s" + if val < 10 { + f = "%.1f%s" + } + + return fmt.Sprintf(f, val, suffix) +} + +type config struct { + base float64 + suffix []string +} + +// Bytes produces human readable representations of byte values in IEC units. +var Bytes = config{1024, []string{"B", "KB", "MB", "GB", "TB", "PB", "EB"}} + +// Count produces human readable representations of unitless values in SI units. +var Count = config{1000, []string{"", "K", "M", "G", "T", "P", "E"}} + +// Int64 produces a human readable representation of the value. +func (c *config) Int64(s int64) FormattedString { + if s < 0 { + return FormattedString("-" + humanate(uint64(-s), c.base, c.suffix)) + } + return FormattedString(humanate(uint64(s), c.base, c.suffix)) +} + +// Uint64 produces a human readable representation of the value. +func (c *config) Uint64(s uint64) FormattedString { + return FormattedString(humanate(s, c.base, c.suffix)) +} + +// FormattedString represents a human readable representation of a value. It +// implements the redact.SafeValue interface to signal that it represents a +// string that does not need to be redacted. +type FormattedString string + +var _ redact.SafeValue = FormattedString("") + +// SafeValue implements redact.SafeValue. +func (fs FormattedString) SafeValue() {} + +// String implements fmt.Stringer.
+func (fs FormattedString) String() string { return string(fs) } diff --git a/pebble/internal/humanize/humanize_test.go b/pebble/internal/humanize/humanize_test.go new file mode 100644 index 0000000..a4a42c3 --- /dev/null +++ b/pebble/internal/humanize/humanize_test.go @@ -0,0 +1,38 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package humanize + +import ( + "bytes" + "fmt" + "strconv" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" +) + +func TestHumanize(t *testing.T) { + datadriven.RunTest(t, "testdata/humanize", func(t *testing.T, td *datadriven.TestData) string { + var c config + switch td.Cmd { + case "bytes": + c = Bytes + case "count": + c = Count + default: + td.Fatalf(t, "invalid command %q", td.Cmd) + } + var buf bytes.Buffer + for _, row := range strings.Split(td.Input, "\n") { + val, err := strconv.ParseInt(row, 10, 64) + if err != nil { + td.Fatalf(t, "error parsing %q: %v", row, err) + } + fmt.Fprintf(&buf, "%s\n", c.Int64(val)) + } + return buf.String() + }) +} diff --git a/pebble/internal/humanize/testdata/humanize b/pebble/internal/humanize/testdata/humanize new file mode 100644 index 0000000..27f554a --- /dev/null +++ b/pebble/internal/humanize/testdata/humanize @@ -0,0 +1,49 @@ +bytes +0 +1 +9 +99 +123 +123456 +12345678 +1234567890 +1234567890123 +123456789012345 +123456789012345678 +---- +0B +1B +9B +99B +123B +121KB +12MB +1.1GB +1.1TB +112TB +110PB + +count +0 +1 +9 +99 +123 +123456 +12345678 +1234567890 +1234567890123 +123456789012345 +123456789012345678 +---- +0 +1 +9 +99 +123 +124K +12M +1.2G +1.2T +124T +124P diff --git a/pebble/internal/intern/intern.go b/pebble/internal/intern/intern.go new file mode 100644 index 0000000..9f8bad5 --- /dev/null +++ b/pebble/internal/intern/intern.go @@ -0,0 +1,27 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package intern + +import "sync" + +var pool = sync.Pool{ + New: func() interface{} { + return make(map[string]string) + }, +} + +// Bytes returns b converted to a string, interned. +func Bytes(b []byte) string { + m := pool.Get().(map[string]string) + c, ok := m[string(b)] + if ok { + pool.Put(m) + return c + } + s := string(b) + m[s] = s + pool.Put(m) + return s +} diff --git a/pebble/internal/intern/intern_test.go b/pebble/internal/intern/intern_test.go new file mode 100644 index 0000000..1db6581 --- /dev/null +++ b/pebble/internal/intern/intern_test.go @@ -0,0 +1,30 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package intern + +import ( + "bytes" + "testing" + + "github.com/cockroachdb/pebble/internal/invariants" +) + +func TestBytes(t *testing.T) { + if invariants.RaceEnabled { + // sync.Pool is a no-op under -race, making this test fail. + t.Skip("not supported under -race") + } + + const abc = "abc" + s := bytes.Repeat([]byte(abc), 100) + n := testing.AllocsPerRun(100, func() { + for i := 0; i < 100; i++ { + _ = Bytes(s[i*len(abc) : (i+1)*len(abc)]) + } + }) + if n > 0 { + t.Fatalf("Bytes allocated %d, want 0", int(n)) + } +} diff --git a/pebble/internal/invalidating/iter.go b/pebble/internal/invalidating/iter.go new file mode 100644 index 0000000..e27db58 --- /dev/null +++ b/pebble/internal/invalidating/iter.go @@ -0,0 +1,168 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package invalidating + +import ( + "context" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/fastrand" + "github.com/cockroachdb/pebble/internal/invariants" +) + +// MaybeWrapIfInvariants wraps some iterators with an invalidating iterator. +// MaybeWrapIfInvariants does nothing in non-invariant builds. +func MaybeWrapIfInvariants(iter base.InternalIterator) base.InternalIterator { + if invariants.Enabled { + if fastrand.Uint32n(10) == 1 { + return NewIter(iter) + } + } + return iter +} + +// iter tests unsafe key/value slice reuse by modifying the last +// returned key/value to all 1s. +type iter struct { + iter base.InternalIterator + lastKey *base.InternalKey + lastValue base.LazyValue + ignoreKinds [base.InternalKeyKindMax + 1]bool + err error +} + +// Option configures the behavior of an invalidating iterator. +type Option interface { + apply(*iter) +} + +type funcOpt func(*iter) + +func (f funcOpt) apply(i *iter) { f(i) } + +// IgnoreKinds constructs an Option that configures an invalidating iterator to +// skip trashing k/v pairs with the provided key kinds. Some iterators provided +// key stability guarantees for specific key kinds. +func IgnoreKinds(kinds ...base.InternalKeyKind) Option { + return funcOpt(func(i *iter) { + for _, kind := range kinds { + i.ignoreKinds[kind] = true + } + }) +} + +// NewIter constructs a new invalidating iterator that wraps the provided +// iterator, trashing buffers for previously returned keys. 
+func NewIter(originalIterator base.InternalIterator, opts ...Option) base.InternalIterator { + i := &iter{iter: originalIterator} + for _, opt := range opts { + opt.apply(i) + } + return i +} + +func (i *iter) update( + key *base.InternalKey, value base.LazyValue, +) (*base.InternalKey, base.LazyValue) { + i.trashLastKV() + if key == nil { + i.lastKey = nil + i.lastValue = base.LazyValue{} + return nil, base.LazyValue{} + } + + i.lastKey = &base.InternalKey{} + *i.lastKey = key.Clone() + i.lastValue = base.LazyValue{ + ValueOrHandle: append(make([]byte, 0, len(value.ValueOrHandle)), value.ValueOrHandle...), + } + if value.Fetcher != nil { + fetcher := new(base.LazyFetcher) + *fetcher = *value.Fetcher + i.lastValue.Fetcher = fetcher + } + return i.lastKey, i.lastValue +} + +func (i *iter) trashLastKV() { + if i.lastKey == nil { + return + } + if i.ignoreKinds[i.lastKey.Kind()] { + return + } + + if i.lastKey != nil { + for j := range i.lastKey.UserKey { + i.lastKey.UserKey[j] = 0xff + } + i.lastKey.Trailer = 0xffffffffffffffff + } + for j := range i.lastValue.ValueOrHandle { + i.lastValue.ValueOrHandle[j] = 0xff + } + if i.lastValue.Fetcher != nil { + // Not all the LazyFetcher fields are visible, so we zero out the last + // value's Fetcher struct entirely. 
+ *i.lastValue.Fetcher = base.LazyFetcher{} + } +} + +func (i *iter) SeekGE(key []byte, flags base.SeekGEFlags) (*base.InternalKey, base.LazyValue) { + return i.update(i.iter.SeekGE(key, flags)) +} + +func (i *iter) SeekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + return i.update(i.iter.SeekPrefixGE(prefix, key, flags)) +} + +func (i *iter) SeekLT(key []byte, flags base.SeekLTFlags) (*base.InternalKey, base.LazyValue) { + return i.update(i.iter.SeekLT(key, flags)) +} + +func (i *iter) First() (*base.InternalKey, base.LazyValue) { + return i.update(i.iter.First()) +} + +func (i *iter) Last() (*base.InternalKey, base.LazyValue) { + return i.update(i.iter.Last()) +} + +func (i *iter) Next() (*base.InternalKey, base.LazyValue) { + return i.update(i.iter.Next()) +} + +func (i *iter) Prev() (*base.InternalKey, base.LazyValue) { + return i.update(i.iter.Prev()) +} + +func (i *iter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { + return i.update(i.iter.NextPrefix(succKey)) +} + +func (i *iter) Error() error { + if err := i.iter.Error(); err != nil { + return err + } + return i.err +} + +func (i *iter) Close() error { + return i.iter.Close() +} + +func (i *iter) SetBounds(lower, upper []byte) { + i.iter.SetBounds(lower, upper) +} + +func (i *iter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) +} + +func (i *iter) String() string { + return i.iter.String() +} diff --git a/pebble/internal/invariants/finalizer_off.go b/pebble/internal/invariants/finalizer_off.go new file mode 100644 index 0000000..d2c600a --- /dev/null +++ b/pebble/internal/invariants/finalizer_off.go @@ -0,0 +1,14 @@ +// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +//go:build (!invariants && !tracing) || race +// +build !invariants,!tracing race + +package invariants + +// SetFinalizer is a wrapper around runtime.SetFinalizer that is a no-op under +// race builds or if neither the invariants or tracing build tags are +// specified. +func SetFinalizer(obj, finalizer interface{}) { +} diff --git a/pebble/internal/invariants/finalizer_on.go b/pebble/internal/invariants/finalizer_on.go new file mode 100644 index 0000000..da4e307 --- /dev/null +++ b/pebble/internal/invariants/finalizer_on.go @@ -0,0 +1,17 @@ +// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build (invariants && !race) || (tracing && !race) +// +build invariants,!race tracing,!race + +package invariants + +import "runtime" + +// SetFinalizer is a wrapper around runtime.SetFinalizer that is a no-op under +// race builds or if neither the invariants or tracing build tags are +// specified. +func SetFinalizer(obj, finalizer interface{}) { + runtime.SetFinalizer(obj, finalizer) +} diff --git a/pebble/internal/invariants/off.go b/pebble/internal/invariants/off.go new file mode 100644 index 0000000..01513f2 --- /dev/null +++ b/pebble/internal/invariants/off.go @@ -0,0 +1,11 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !invariants && !race +// +build !invariants,!race + +package invariants + +// Enabled is true if we were built with the "invariants" or "race" build tags. +const Enabled = false diff --git a/pebble/internal/invariants/on.go b/pebble/internal/invariants/on.go new file mode 100644 index 0000000..b418680 --- /dev/null +++ b/pebble/internal/invariants/on.go @@ -0,0 +1,11 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build invariants || race +// +build invariants race + +package invariants + +// Enabled is true if we were built with the "invariants" or "race" build tags. +const Enabled = true diff --git a/pebble/internal/invariants/race_off.go b/pebble/internal/invariants/race_off.go new file mode 100644 index 0000000..b2b8c5e --- /dev/null +++ b/pebble/internal/invariants/race_off.go @@ -0,0 +1,11 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !race +// +build !race + +package invariants + +// RaceEnabled is true if we were built with the "race" build tag. +const RaceEnabled = false diff --git a/pebble/internal/invariants/race_on.go b/pebble/internal/invariants/race_on.go new file mode 100644 index 0000000..46613f7 --- /dev/null +++ b/pebble/internal/invariants/race_on.go @@ -0,0 +1,11 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build race +// +build race + +package invariants + +// RaceEnabled is true if we were built with the "race" build tag. +const RaceEnabled = true diff --git a/pebble/internal/itertest/datadriven.go b/pebble/internal/itertest/datadriven.go new file mode 100644 index 0000000..6c2feef --- /dev/null +++ b/pebble/internal/itertest/datadriven.go @@ -0,0 +1,196 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package itertest provides facilities for testing internal iterators. 
+package itertest + +import ( + "bytes" + "fmt" + "io" + "strconv" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/stretchr/testify/require" +) + +type iterCmdOpts struct { + fmtKV func(io.Writer, *base.InternalKey, []byte, base.InternalIterator) + stats *base.InternalIteratorStats +} + +// An IterOpt configures the behavior of RunInternalIterCmd. +type IterOpt func(*iterCmdOpts) + +// Verbose configures RunInternalIterCmd to output verbose results. +func Verbose(opts *iterCmdOpts) { opts.fmtKV = verboseFmt } + +// Condensed configures RunInternalIterCmd to output condensed results without +// values. +func Condensed(opts *iterCmdOpts) { opts.fmtKV = condensedFmt } + +// WithStats configures RunInternalIterCmd to collect iterator stats in the +// struct pointed to by stats. +func WithStats(stats *base.InternalIteratorStats) IterOpt { + return func(opts *iterCmdOpts) { + opts.stats = stats + } +} + +func defaultFmt(w io.Writer, key *base.InternalKey, v []byte, iter base.InternalIterator) { + if key != nil { + fmt.Fprintf(w, "%s:%s\n", key.UserKey, v) + } else if err := iter.Error(); err != nil { + fmt.Fprintf(w, "err=%v\n", err) + } else { + fmt.Fprintf(w, ".\n") + } +} + +func condensedFmt(w io.Writer, key *base.InternalKey, v []byte, iter base.InternalIterator) { + if key != nil { + fmt.Fprintf(w, "<%s:%d>", key.UserKey, key.SeqNum()) + } else if err := iter.Error(); err != nil { + fmt.Fprintf(w, "err=%v", err) + } else { + fmt.Fprint(w, ".") + } +} + +func verboseFmt(w io.Writer, key *base.InternalKey, v []byte, iter base.InternalIterator) { + if key != nil { + fmt.Fprintf(w, "%s:%s\n", key, v) + return + } + defaultFmt(w, key, v, iter) +} + +// RunInternalIterCmd evaluates a datadriven command controlling an internal +// iterator, returning a string with the results of the iterator operations. 
+func RunInternalIterCmd( + t *testing.T, d *datadriven.TestData, iter base.InternalIterator, opts ...IterOpt, +) string { + var buf bytes.Buffer + RunInternalIterCmdWriter(t, &buf, d, iter, opts...) + return buf.String() +} + +// RunInternalIterCmdWriter evaluates a datadriven command controlling an +// internal iterator, writing the results of the iterator operations to the +// provided Writer. +func RunInternalIterCmdWriter( + t *testing.T, w io.Writer, d *datadriven.TestData, iter base.InternalIterator, opts ...IterOpt, +) { + o := iterCmdOpts{fmtKV: defaultFmt} + for _, opt := range opts { + opt(&o) + } + + getKV := func(key *base.InternalKey, val base.LazyValue) (*base.InternalKey, []byte) { + v, _, err := val.Value(nil) + require.NoError(t, err) + return key, v + } + var prefix []byte + for _, line := range strings.Split(d.Input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + var key *base.InternalKey + var value []byte + switch parts[0] { + case "seek-ge": + if len(parts) < 2 || len(parts) > 3 { + fmt.Fprint(w, "seek-ge []\n") + return + } + prefix = nil + var flags base.SeekGEFlags + if len(parts) == 3 { + if trySeekUsingNext, err := strconv.ParseBool(parts[2]); err != nil { + fmt.Fprintf(w, "%s", err.Error()) + return + } else if trySeekUsingNext { + flags = flags.EnableTrySeekUsingNext() + } + } + key, value = getKV(iter.SeekGE([]byte(strings.TrimSpace(parts[1])), flags)) + case "seek-prefix-ge": + if len(parts) != 2 && len(parts) != 3 { + fmt.Fprint(w, "seek-prefix-ge []\n") + return + } + prefix = []byte(strings.TrimSpace(parts[1])) + var flags base.SeekGEFlags + if len(parts) == 3 { + if trySeekUsingNext, err := strconv.ParseBool(parts[2]); err != nil { + fmt.Fprintf(w, "%s", err.Error()) + return + } else if trySeekUsingNext { + flags = flags.EnableTrySeekUsingNext() + } + } + key, value = getKV(iter.SeekPrefixGE(prefix, prefix /* key */, flags)) + case "seek-lt": + if len(parts) != 2 { + fmt.Fprint(w, "seek-lt \n") + 
return + } + prefix = nil + key, value = getKV(iter.SeekLT([]byte(strings.TrimSpace(parts[1])), base.SeekLTFlagsNone)) + case "first": + prefix = nil + key, value = getKV(iter.First()) + case "last": + prefix = nil + key, value = getKV(iter.Last()) + case "next": + key, value = getKV(iter.Next()) + case "prev": + key, value = getKV(iter.Prev()) + case "set-bounds": + if len(parts) <= 1 || len(parts) > 3 { + fmt.Fprint(w, "set-bounds lower=<lower> upper=<upper>\n") + return + } + var lower []byte + var upper []byte + for _, part := range parts[1:] { + arg := strings.Split(strings.TrimSpace(part), "=") + switch arg[0] { + case "lower": + lower = []byte(arg[1]) + case "upper": + upper = []byte(arg[1]) + default: + fmt.Fprintf(w, "set-bounds: unknown arg: %s", arg) + return + } + } + iter.SetBounds(lower, upper) + continue + case "stats": + if o.stats != nil { + // The timing is non-deterministic, so set to 0. + o.stats.BlockReadDuration = 0 + fmt.Fprintf(w, "%+v\n", *o.stats) + } + continue + case "reset-stats": + if o.stats != nil { + *o.stats = base.InternalIteratorStats{} + } + continue + default: + fmt.Fprintf(w, "unknown op: %s", parts[0]) + return + } + o.fmtKV(w, key, value, iter) + + } +} diff --git a/pebble/internal/keyspan/bounded.go b/pebble/internal/keyspan/bounded.go new file mode 100644 index 0000000..70dd395 --- /dev/null +++ b/pebble/internal/keyspan/bounded.go @@ -0,0 +1,268 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. +// +package keyspan + +import "github.com/cockroachdb/pebble/internal/base" + +// TODO(jackson): Consider removing this type and adding bounds enforcement +// directly to the MergingIter. This type is probably too lightweight to warrant +// its own type, but for now we implement it separately for expediency. 
+ +// boundedIterPos records the position of the BoundedIter relative to the +// underlying iterator's position. It's used to avoid Next/Prev-ing the iterator +// if there can't possibly be another span within bounds, because the current +// span overlaps the bound. +// +// Imagine bounds [a,c) and an iterator that seeks to a span [b,d). The span +// [b,d) overlaps some portion of the iterator bounds, so the iterator must +// return it. If the iterator is subsequently Nexted, Next can tell that the +// iterator is exhausted without advancing the underlying iterator because the +// current span's end bound of d is ≥ the upper bound of c. In this case, the +// bounded iterator returns nil and records i.pos as posAtUpperLimit to remember +// that the underlying iterator position does not match the current BoundedIter +// position. +type boundedIterPos int8 + +const ( + posAtLowerLimit boundedIterPos = -1 + posAtIterSpan boundedIterPos = 0 + posAtUpperLimit boundedIterPos = +1 +) + +// BoundedIter implements FragmentIterator and enforces bounds. +// +// Like the point InternalIterator interface, the bounded iterator's forward +// positioning routines (SeekGE, First, and Next) only check the upper bound. +// The reverse positioning routines (SeekLT, Last, and Prev) only check the +// lower bound. It is up to the caller to ensure that the forward positioning +// routines respect the lower bound and the reverse positioning routines respect +// the upper bound (i.e. calling SeekGE instead of First if there is a lower +// bound, and SeekLT instead of Last if there is an upper bound). +// +// When the hasPrefix parameter indicates that the iterator is in prefix +// iteration mode, BoundedIter elides any spans that do not overlap with the +// prefix's keyspace. In prefix iteration mode, reverse iteration is disallowed, +// except for an initial SeekLT with a seek key greater than or equal to the +// prefix. 
In prefix iteration mode, the first seek must position the iterator +// at or immediately before the first fragment covering a key greater than or +// equal to the prefix. +type BoundedIter struct { + iter FragmentIterator + iterSpan *Span + cmp base.Compare + split base.Split + lower []byte + upper []byte + hasPrefix *bool + prefix *[]byte + pos boundedIterPos +} + +// Init initializes the bounded iterator. +// +// In addition to the iterator bounds, Init takes pointers to a boolean +// indicating whether the iterator is in prefix iteration mode and the prefix +// key if it is. This is used to exclude spans that are outside the iteration +// prefix. +// +// hasPrefix and prefix are allowed to be nil, however if hasPrefix != nil, +// prefix must also not be nil. +func (i *BoundedIter) Init( + cmp base.Compare, + split base.Split, + iter FragmentIterator, + lower, upper []byte, + hasPrefix *bool, + prefix *[]byte, +) { + *i = BoundedIter{ + iter: iter, + cmp: cmp, + split: split, + lower: lower, + upper: upper, + hasPrefix: hasPrefix, + prefix: prefix, + } +} + +var _ FragmentIterator = (*BoundedIter)(nil) + +// Seek calls. +// +// Seek calls check iterator bounds in the direction of the seek. Additionally, +// if the iterator is in prefix iteration mode, seek calls check both start and +// end bounds against the prefix's bounds. We check both bounds for defense in +// depth. This optimization has been a source of various bugs due to various +// other prefix iteration optimizations that can result in seek keys that don't +// respect the prefix bounds. + +// SeekGE implements FragmentIterator. +func (i *BoundedIter) SeekGE(key []byte) *Span { + s := i.iter.SeekGE(key) + s = i.checkPrefixSpanStart(s) + s = i.checkPrefixSpanEnd(s) + return i.checkForwardBound(s) +} + +// SeekLT implements FragmentIterator. 
+func (i *BoundedIter) SeekLT(key []byte) *Span { + s := i.iter.SeekLT(key) + s = i.checkPrefixSpanStart(s) + s = i.checkPrefixSpanEnd(s) + return i.checkBackwardBound(s) +} + +// First implements FragmentIterator. +func (i *BoundedIter) First() *Span { + s := i.iter.First() + s = i.checkPrefixSpanStart(s) + return i.checkForwardBound(s) +} + +// Last implements FragmentIterator. +func (i *BoundedIter) Last() *Span { + s := i.iter.Last() + s = i.checkPrefixSpanEnd(s) + return i.checkBackwardBound(s) +} + +// Next implements FragmentIterator. +func (i *BoundedIter) Next() *Span { + switch i.pos { + case posAtLowerLimit: + // The BoundedIter had previously returned nil, because it knew from + // i.iterSpan's bounds that there was no previous span. To Next, we only + // need to return the current iter span and reset i.pos to reflect that + // we're no longer positioned at the limit. + i.pos = posAtIterSpan + return i.iterSpan + case posAtIterSpan: + // If the span at the underlying iterator position extends to or beyond the + // upper bound, we can avoid advancing because the next span is necessarily + // out of bounds. + if i.iterSpan != nil && i.upper != nil && i.cmp(i.iterSpan.End, i.upper) >= 0 { + i.pos = posAtUpperLimit + return nil + } + // Similarly, if the span extends to the next prefix and we're in prefix + // iteration mode, we can avoid advancing. + if i.iterSpan != nil && i.hasPrefix != nil && *i.hasPrefix { + ei := i.split(i.iterSpan.End) + if i.cmp(i.iterSpan.End[:ei], *i.prefix) > 0 { + i.pos = posAtUpperLimit + return nil + } + } + return i.checkForwardBound(i.checkPrefixSpanStart(i.iter.Next())) + case posAtUpperLimit: + // Already exhausted. + return nil + default: + panic("unreachable") + } +} + +// Prev implements FragmentIterator. +func (i *BoundedIter) Prev() *Span { + switch i.pos { + case posAtLowerLimit: + // Already exhausted. 
+ return nil + case posAtIterSpan: + // If the span at the underlying iterator position extends to or beyond + // the lower bound, we can avoid advancing because the previous span is + // necessarily out of bounds. + if i.iterSpan != nil && i.lower != nil && i.cmp(i.iterSpan.Start, i.lower) <= 0 { + i.pos = posAtLowerLimit + return nil + } + // Similarly, if the span extends to or beyond the current prefix and + // we're in prefix iteration mode, we can avoid advancing. + if i.iterSpan != nil && i.hasPrefix != nil && *i.hasPrefix { + si := i.split(i.iterSpan.Start) + if i.cmp(i.iterSpan.Start[:si], *i.prefix) < 0 { + i.pos = posAtLowerLimit + return nil + } + } + return i.checkBackwardBound(i.checkPrefixSpanEnd(i.iter.Prev())) + case posAtUpperLimit: + // The BoundedIter had previously returned nil, because it knew from + // i.iterSpan's bounds that there was no next span. To Prev, we only + // need to return the current iter span and reset i.pos to reflect that + // we're no longer positioned at the limit. + i.pos = posAtIterSpan + return i.iterSpan + default: + panic("unreachable") + } +} + +// Error implements FragmentIterator. +func (i *BoundedIter) Error() error { + return i.iter.Error() +} + +// Close implements FragmentIterator. +func (i *BoundedIter) Close() error { + return i.iter.Close() +} + +// SetBounds modifies the FragmentIterator's bounds. +func (i *BoundedIter) SetBounds(lower, upper []byte) { + i.lower, i.upper = lower, upper +} + +func (i *BoundedIter) checkPrefixSpanStart(span *Span) *Span { + // Compare to the prefix's bounds, if in prefix iteration mode. + if span != nil && i.hasPrefix != nil && *i.hasPrefix { + si := i.split(span.Start) + if i.cmp(span.Start[:si], *i.prefix) > 0 { + // This span starts at a prefix that sorts after our current prefix. + span = nil + } + } + return span +} + +// checkForwardBound enforces the upper bound, returning nil if the provided +// span is wholly outside the upper bound. 
It also updates i.pos and i.iterSpan +// to reflect the new iterator position. +func (i *BoundedIter) checkForwardBound(span *Span) *Span { + // Compare to the upper bound. + if span != nil && i.upper != nil && i.cmp(span.Start, i.upper) >= 0 { + span = nil + } + i.iterSpan = span + if i.pos != posAtIterSpan { + i.pos = posAtIterSpan + } + return span +} + +func (i *BoundedIter) checkPrefixSpanEnd(span *Span) *Span { + // Compare to the prefix's bounds, if in prefix iteration mode. + if span != nil && i.hasPrefix != nil && *i.hasPrefix && i.cmp(span.End, *i.prefix) <= 0 { + // This span ends before the current prefix. + span = nil + } + return span +} + +// checkBackwardBound enforces the lower bound, returning nil if the provided span is +// wholly outside the lower bound. It also updates i.pos and i.iterSpan to +// reflect the new iterator position. +func (i *BoundedIter) checkBackwardBound(span *Span) *Span { + // Compare to the lower bound. + if span != nil && i.lower != nil && i.cmp(span.End, i.lower) <= 0 { + span = nil + } + i.iterSpan = span + if i.pos != posAtIterSpan { + i.pos = posAtIterSpan + } + return span +} diff --git a/pebble/internal/keyspan/bounded_test.go b/pebble/internal/keyspan/bounded_test.go new file mode 100644 index 0000000..edb3b5a --- /dev/null +++ b/pebble/internal/keyspan/bounded_test.go @@ -0,0 +1,69 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "bytes" + "fmt" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/testkeys" +) + +func TestBoundedIter(t *testing.T) { + getBounds := func(td *datadriven.TestData) (lower, upper []byte) { + for _, cmdArg := range td.CmdArgs { + switch cmdArg.Key { + case "lower": + if len(cmdArg.Vals[0]) > 0 { + lower = []byte(cmdArg.Vals[0]) + } + case "upper": + if len(cmdArg.Vals[0]) > 0 { + upper = []byte(cmdArg.Vals[0]) + } + } + } + return lower, upper + } + + cmp := testkeys.Comparer.Compare + split := testkeys.Comparer.Split + var buf bytes.Buffer + var iter BoundedIter + var hasPrefix bool + var prefix []byte + datadriven.RunTest(t, "testdata/bounded_iter", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + var spans []Span + lines := strings.Split(strings.TrimSpace(td.Input), "\n") + for _, line := range lines { + spans = append(spans, ParseSpan(line)) + } + inner := &invalidatingIter{iter: NewIter(cmp, spans)} + lower, upper := getBounds(td) + iter.Init(cmp, split, inner, lower, upper, &hasPrefix, &prefix) + return "" + case "set-prefix": + hasPrefix = len(td.CmdArgs) > 0 + if hasPrefix { + prefix = []byte(td.CmdArgs[0].String()) + return fmt.Sprintf("set prefix to %q\n", prefix) + } + return "cleared prefix" + case "iter": + buf.Reset() + lower, upper := getBounds(td) + iter.SetBounds(lower, upper) + runIterCmd(t, td, &iter, &buf) + return buf.String() + default: + return fmt.Sprintf("unrecognized command %q", td.Cmd) + } + }) +} diff --git a/pebble/internal/keyspan/datadriven_test.go b/pebble/internal/keyspan/datadriven_test.go new file mode 100644 index 0000000..5b1d7aa --- /dev/null +++ b/pebble/internal/keyspan/datadriven_test.go @@ -0,0 +1,432 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "fmt" + "go/token" + "io" + "reflect" + "strconv" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/dsl" +) + +// This file contains testing facilities for Spans and FragmentIterators. It's +// defined here so that it may be used by the keyspan package to test its +// various FragmentIterator implementations. +// +// TODO(jackson): Move keyspan.{Span,Key,FragmentIterator} into internal/base, +// and then move the testing facilities to an independent package, eg +// internal/itertest. + +// probe defines an interface for probes that may inspect or mutate internal +// span iterator behavior. +type probe interface { + // probe inspects, and possibly manipulates, iterator operations' results. + probe(*probeContext) +} + +func parseProbes(probeDSLs ...string) []probe { + probes := make([]probe, len(probeDSLs)) + var err error + for i := range probeDSLs { + probes[i], err = probeParser.Parse(probeDSLs[i]) + if err != nil { + panic(err) + } + } + return probes +} + +func attachProbes(iter FragmentIterator, pctx probeContext, probes ...probe) FragmentIterator { + if pctx.log == nil { + pctx.log = io.Discard + } + for i := range probes { + iter = &probeIterator{ + iter: iter, + probe: probes[i], + probeCtx: pctx, + } + } + return iter +} + +// probeContext provides the context within which a probe is run. It includes +// information about the iterator operation in progress. +type probeContext struct { + op + log io.Writer +} + +type op struct { + Kind OpKind + SeekKey []byte + Span *Span + Err error +} + +// ErrInjected is an error artificially injected for testing. 
+var ErrInjected = &errorProbe{name: "ErrInjected", err: errors.New("injected error")} + +var probeParser = func() *dsl.Parser[probe] { + valuerParser := dsl.NewParser[valuer]() + valuerParser.DefineConstant("StartKey", func() valuer { return startKey{} }) + valuerParser.DefineFunc("Bytes", + func(p *dsl.Parser[valuer], s *dsl.Scanner) valuer { + v := bytesConstant{bytes: []byte(s.ConsumeString())} + s.Consume(token.RPAREN) + return v + }) + + predicateParser := dsl.NewPredicateParser[*probeContext]() + predicateParser.DefineFunc("Equal", + func(p *dsl.Parser[dsl.Predicate[*probeContext]], s *dsl.Scanner) dsl.Predicate[*probeContext] { + eq := equal{ + valuerParser.ParseFromPos(s, s.Scan()), + valuerParser.ParseFromPos(s, s.Scan()), + } + s.Consume(token.RPAREN) + return eq + }) + for i, name := range opNames { + opKind := OpKind(i) + predicateParser.DefineConstant(name, func() dsl.Predicate[*probeContext] { + // An OpKind implements dsl.Predicate[*probeContext]. + return opKind + }) + } + probeParser := dsl.NewParser[probe]() + probeParser.DefineConstant("ErrInjected", func() probe { return ErrInjected }) + probeParser.DefineConstant("noop", func() probe { return noop{} }) + probeParser.DefineFunc("If", + func(p *dsl.Parser[probe], s *dsl.Scanner) probe { + probe := ifProbe{ + predicateParser.ParseFromPos(s, s.Scan()), + probeParser.ParseFromPos(s, s.Scan()), + probeParser.ParseFromPos(s, s.Scan()), + } + s.Consume(token.RPAREN) + return probe + }) + probeParser.DefineFunc("Return", + func(p *dsl.Parser[probe], s *dsl.Scanner) (ret probe) { + switch tok := s.Scan(); tok.Kind { + case token.STRING: + str, err := strconv.Unquote(tok.Lit) + if err != nil { + panic(err) + } + span := ParseSpan(str) + ret = returnSpan{s: &span} + case token.IDENT: + switch tok.Lit { + case "nil": + ret = returnSpan{s: nil} + default: + panic(errors.Newf("unrecognized return value %q", tok.Lit)) + } + } + s.Consume(token.RPAREN) + return ret + }) + probeParser.DefineFunc("Log", + func(p 
*dsl.Parser[probe], s *dsl.Scanner) (ret probe) { + ret = loggingProbe{prefix: s.ConsumeString()} + s.Consume(token.RPAREN) + return ret + }) + return probeParser +}() + +// probe implementations + +type errorProbe struct { + name string + err error +} + +func (p *errorProbe) String() string { return p.name } +func (p *errorProbe) Error() error { return p.err } +func (p *errorProbe) probe(pctx *probeContext) { + pctx.op.Err = p.err + pctx.op.Span = nil +} + +// ifProbe is a conditional probe. If its predicate evaluates to true, it probes +// using its Then probe. If its predicate evaluates to false, it probes using its +// Else probe. +type ifProbe struct { + Predicate dsl.Predicate[*probeContext] + Then probe + Else probe +} + +func (p ifProbe) String() string { return fmt.Sprintf("(If %s %s %s)", p.Predicate, p.Then, p.Else) } +func (p ifProbe) probe(pctx *probeContext) { + if p.Predicate.Evaluate(pctx) { + p.Then.probe(pctx) + } else { + p.Else.probe(pctx) + } +} + +type returnSpan struct { + s *Span +} + +func (p returnSpan) String() string { + if p.s == nil { + return "(Return nil)" + } + return fmt.Sprintf("(Return %q)", p.s.String()) +} + +func (p returnSpan) probe(pctx *probeContext) { + pctx.op.Span = p.s + pctx.op.Err = nil +} + +type noop struct{} + +func (noop) String() string { return "Noop" } +func (noop) probe(pctx *probeContext) {} + +type loggingProbe struct { + prefix string +} + +func (lp loggingProbe) String() string { return fmt.Sprintf("(Log %q)", lp.prefix) } +func (lp loggingProbe) probe(pctx *probeContext) { + opStr := strings.TrimPrefix(pctx.op.Kind.String(), "Op") + fmt.Fprintf(pctx.log, "%s%s(", lp.prefix, opStr) + if pctx.op.SeekKey != nil { + fmt.Fprintf(pctx.log, "%q", pctx.op.SeekKey) + } + fmt.Fprint(pctx.log, ") = ") + if pctx.op.Span == nil { + fmt.Fprint(pctx.log, "nil") + if pctx.op.Err != nil { + fmt.Fprintf(pctx.log, " <err=%q>", pctx.op.Err) + } + } else { + fmt.Fprint(pctx.log, pctx.op.Span.String()) + } + fmt.Fprintln(pctx.log) +} + 
+// dsl.Predicate[*probeContext] implementations. + +type equal struct { + a, b valuer +} + +func (e equal) String() string { return fmt.Sprintf("(Equal %s %s)", e.a, e.b) } +func (e equal) Evaluate(pctx *probeContext) bool { + return reflect.DeepEqual(e.a.value(pctx), e.b.value(pctx)) +} + +// OpKind indicates the type of iterator operation being performed. +type OpKind int8 + +const ( + OpSeekGE OpKind = iota + OpSeekLT + OpFirst + OpLast + OpNext + OpPrev + OpClose + numOpKinds +) + +func (o OpKind) String() string { return opNames[o] } +func (o OpKind) Evaluate(pctx *probeContext) bool { return pctx.op.Kind == o } + +var opNames = [numOpKinds]string{ + OpSeekGE: "OpSeekGE", + OpSeekLT: "OpSeekLT", + OpFirst: "OpFirst", + OpLast: "OpLast", + OpNext: "OpNext", + OpPrev: "OpPrev", + OpClose: "OpClose", +} + +// valuer implementations + +type valuer interface { + fmt.Stringer + value(pctx *probeContext) any +} + +type bytesConstant struct { + bytes []byte +} + +func (b bytesConstant) String() string { return fmt.Sprintf("%q", string(b.bytes)) } +func (b bytesConstant) value(pctx *probeContext) any { return b.bytes } + +type startKey struct{} + +func (s startKey) String() string { return "StartKey" } +func (s startKey) value(pctx *probeContext) any { + if pctx.op.Span == nil { + return nil + } + return pctx.op.Span.Start +} + +type probeIterator struct { + iter FragmentIterator + err error + probe probe + probeCtx probeContext +} + +// Assert that probeIterator implements the fragment iterator interface. 
+var _ FragmentIterator = (*probeIterator)(nil) + +func (p *probeIterator) handleOp(preProbeOp op) *Span { + p.probeCtx.op = preProbeOp + if preProbeOp.Span == nil && p.iter != nil { + p.probeCtx.op.Err = p.iter.Error() + } + + p.probe.probe(&p.probeCtx) + p.err = p.probeCtx.op.Err + return p.probeCtx.op.Span +} + +func (p *probeIterator) SeekGE(key []byte) *Span { + op := op{ + Kind: OpSeekGE, + SeekKey: key, + } + if p.iter != nil { + op.Span = p.iter.SeekGE(key) + } + return p.handleOp(op) +} + +func (p *probeIterator) SeekLT(key []byte) *Span { + op := op{ + Kind: OpSeekLT, + SeekKey: key, + } + if p.iter != nil { + op.Span = p.iter.SeekLT(key) + } + return p.handleOp(op) +} + +func (p *probeIterator) First() *Span { + op := op{Kind: OpFirst} + if p.iter != nil { + op.Span = p.iter.First() + } + return p.handleOp(op) +} + +func (p *probeIterator) Last() *Span { + op := op{Kind: OpLast} + if p.iter != nil { + op.Span = p.iter.Last() + } + return p.handleOp(op) +} + +func (p *probeIterator) Next() *Span { + op := op{Kind: OpNext} + if p.iter != nil { + op.Span = p.iter.Next() + } + return p.handleOp(op) +} + +func (p *probeIterator) Prev() *Span { + op := op{Kind: OpPrev} + if p.iter != nil { + op.Span = p.iter.Prev() + } + return p.handleOp(op) +} + +func (p *probeIterator) Error() error { + return p.err +} + +func (p *probeIterator) Close() error { + op := op{Kind: OpClose} + if p.iter != nil { + op.Err = p.iter.Close() + } + + p.probeCtx.op = op + p.probe.probe(&p.probeCtx) + p.err = p.probeCtx.op.Err + return p.err +} + +// runIterCmd evaluates a datadriven command controlling an internal +// keyspan.FragmentIterator, writing the results of the iterator operations to +// the provided writer. 
+func runIterCmd(t *testing.T, td *datadriven.TestData, iter FragmentIterator, w io.Writer) { + lines := strings.Split(strings.TrimSpace(td.Input), "\n") + for i, line := range lines { + if i > 0 { + fmt.Fprintln(w) + } + line = strings.TrimSpace(line) + i := strings.IndexByte(line, '#') + iterCmd := line + if i > 0 { + iterCmd = string(line[:i]) + } + runIterOp(w, iter, iterCmd) + } +} + +var iterDelim = map[rune]bool{',': true, ' ': true, '(': true, ')': true, '"': true} + +func runIterOp(w io.Writer, it FragmentIterator, op string) { + fields := strings.FieldsFunc(op, func(r rune) bool { return iterDelim[r] }) + var s *Span + switch strings.ToLower(fields[0]) { + case "first": + s = it.First() + case "last": + s = it.Last() + case "seekge", "seek-ge": + if len(fields) == 1 { + panic(fmt.Sprintf("unable to parse iter op %q", op)) + } + s = it.SeekGE([]byte(fields[1])) + case "seeklt", "seek-lt": + if len(fields) == 1 { + panic(fmt.Sprintf("unable to parse iter op %q", op)) + } + s = it.SeekLT([]byte(fields[1])) + case "next": + s = it.Next() + case "prev": + s = it.Prev() + default: + panic(fmt.Sprintf("unrecognized iter op %q", fields[0])) + } + if s == nil { + fmt.Fprint(w, "<nil>") + if err := it.Error(); err != nil { + fmt.Fprintf(w, " err=<%s>", it.Error()) + } + return + } + fmt.Fprint(w, s) +} diff --git a/pebble/internal/keyspan/defragment.go b/pebble/internal/keyspan/defragment.go new file mode 100644 index 0000000..d056ef0 --- /dev/null +++ b/pebble/internal/keyspan/defragment.go @@ -0,0 +1,539 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "bytes" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/bytealloc" + "github.com/cockroachdb/pebble/internal/invariants" +) + +// bufferReuseMaxCapacity is the maximum capacity of a DefragmentingIter buffer +// that DefragmentingIter will reuse. Buffers larger than this will be +// discarded and reallocated as necessary. +const bufferReuseMaxCapacity = 10 << 10 // 10 KB + +// keysReuseMaxCapacity is the maximum capacity of a []keyspan.Key buffer that +// DefragmentingIter will reuse. Buffers larger than this will be discarded and +// reallocated as necessary. +const keysReuseMaxCapacity = 100 + +// DefragmentMethod configures the defragmentation performed by the +// DefragmentingIter. +type DefragmentMethod interface { + // ShouldDefragment takes two abutting spans and returns whether the two + // spans should be combined into a single, defragmented Span. + ShouldDefragment(equal base.Equal, left, right *Span) bool +} + +// The DefragmentMethodFunc type is an adapter to allow the use of ordinary +// functions as DefragmentMethods. If f is a function with the appropriate +// signature, DefragmentMethodFunc(f) is a DefragmentMethod that calls f. +type DefragmentMethodFunc func(equal base.Equal, left, right *Span) bool + +// ShouldDefragment calls f(equal, left, right). +func (f DefragmentMethodFunc) ShouldDefragment(equal base.Equal, left, right *Span) bool { + return f(equal, left, right) +} + +// DefragmentInternal configures a DefragmentingIter to defragment spans +// only if they have identical keys. It requires spans' keys to be sorted in +// trailer descending order. +// +// This defragmenting method is intended for use in compactions that may see +// internal range keys fragments that may now be joined, because the state that +// required their fragmentation has been dropped. 
+var DefragmentInternal DefragmentMethod = DefragmentMethodFunc(func(equal base.Equal, a, b *Span) bool { + if a.KeysOrder != ByTrailerDesc || b.KeysOrder != ByTrailerDesc { + panic("pebble: span keys unexpectedly not in trailer descending order") + } + if len(a.Keys) != len(b.Keys) { + return false + } + for i := range a.Keys { + if a.Keys[i].Trailer != b.Keys[i].Trailer { + return false + } + if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) { + return false + } + if !bytes.Equal(a.Keys[i].Value, b.Keys[i].Value) { + return false + } + } + return true +}) + +// DefragmentReducer merges the current and next Key slices, returning a new Key +// slice. +// +// Implementations should modify and return `cur` to save on allocations, or +// consider allocating a new slice, as the `cur` slice may be retained by the +// DefragmentingIter and mutated. The `next` slice must not be mutated. +// +// The incoming slices are sorted by (SeqNum, Kind) descending. The output slice +// must also have this sort order. +type DefragmentReducer func(cur, next []Key) []Key + +// StaticDefragmentReducer is a no-op DefragmentReducer that simply returns the +// current key slice, effectively retaining the first set of keys encountered +// for a defragmented span. +// +// This reducer can be used, for example, when the set of Keys for each Span +// being reduced is not expected to change, and therefore the keys from the +// first span encountered can be used without considering keys in subsequent +// spans. +var StaticDefragmentReducer DefragmentReducer = func(cur, _ []Key) []Key { + return cur +} + +// iterPos is an enum indicating the position of the defragmenting iter's +// wrapped iter. The defragmenting iter must look ahead or behind when +// defragmenting forward or backwards respectively, and this enum records that +// current position. 
+type iterPos int8 + +const ( + iterPosPrev iterPos = -1 + iterPosCurr iterPos = 0 + iterPosNext iterPos = +1 +) + +// DefragmentingIter wraps a key span iterator, defragmenting physical +// fragmentation during iteration. +// +// During flushes and compactions, keys applied over a span may be split at +// sstable boundaries. This fragmentation can produce internal key bounds that +// do not match any of the bounds ever supplied to a user operation. This +// physical fragmentation is necessary to avoid excessively wide sstables. +// +// The defragmenting iterator undoes this physical fragmentation, joining spans +// with abutting bounds and equal state. The defragmenting iterator takes a +// DefragmentMethod to determine what is "equal state" for a span. The +// DefragmentMethod is a function type, allowing arbitrary comparisons between +// Span keys. +// +// Seeking (SeekGE, SeekLT) poses an obstacle to defragmentation. A seek may +// land on a physical fragment in the middle of several fragments that must be +// defragmented. A seek that lands in a fragment straddling the seek key must +// first degfragment in the opposite direction of iteration to find the +// beginning of the defragmented span, and then defragments in the iteration +// direction, ensuring it's found a whole defragmented span. +type DefragmentingIter struct { + // DefragmentingBuffers holds buffers used for copying iterator state. + *DefragmentingBuffers + comparer *base.Comparer + equal base.Equal + iter FragmentIterator + iterSpan *Span + iterPos iterPos + + // curr holds the span at the current iterator position. + curr Span + + // method is a comparison function for two spans. method is called when two + // spans are abutting to determine whether they may be defragmented. + // method does not itself check for adjacency for the two spans. + method DefragmentMethod + + // reduce is the reducer function used to collect Keys across all spans that + // constitute a defragmented span. 
+ reduce DefragmentReducer +} + +// DefragmentingBuffers holds buffers used for copying iterator state. +type DefragmentingBuffers struct { + // currBuf is a buffer for use when copying user keys for curr. currBuf is + // cleared between positioning methods. + currBuf bytealloc.A + // keysBuf is a buffer for use when copying Keys for DefragmentingIter.curr. + keysBuf []Key + // keyBuf is a buffer specifically for the defragmented start key when + // defragmenting backwards or the defragmented end key when defragmenting + // forwards. These bounds are overwritten repeatedly during defragmentation, + // and the defragmentation routines overwrite keyBuf repeatedly to store + // these extended bounds. + keyBuf []byte +} + +// PrepareForReuse discards any excessively large buffers. +func (bufs *DefragmentingBuffers) PrepareForReuse() { + if cap(bufs.currBuf) > bufferReuseMaxCapacity { + bufs.currBuf = nil + } + if cap(bufs.keyBuf) > bufferReuseMaxCapacity { + bufs.keyBuf = nil + } + if cap(bufs.keysBuf) > keysReuseMaxCapacity { + bufs.keysBuf = nil + } +} + +// Assert that *DefragmentingIter implements the FragmentIterator interface. +var _ FragmentIterator = (*DefragmentingIter)(nil) + +// Init initializes the defragmenting iter using the provided defragment +// method. +func (i *DefragmentingIter) Init( + comparer *base.Comparer, + iter FragmentIterator, + equal DefragmentMethod, + reducer DefragmentReducer, + bufs *DefragmentingBuffers, +) { + *i = DefragmentingIter{ + DefragmentingBuffers: bufs, + comparer: comparer, + equal: comparer.Equal, + iter: iter, + method: equal, + reduce: reducer, + } +} + +// Error returns any accumulated error. +func (i *DefragmentingIter) Error() error { + return i.iter.Error() +} + +// Close closes the underlying iterators. +func (i *DefragmentingIter) Close() error { + return i.iter.Close() +} + +// SeekGE moves the iterator to the first span covering a key greater than or +// equal to the given key. 
This is equivalent to seeking to the first span with +// an end key greater than the given key. +func (i *DefragmentingIter) SeekGE(key []byte) *Span { + i.iterSpan = i.iter.SeekGE(key) + if i.iterSpan == nil { + i.iterPos = iterPosCurr + return nil + } else if i.iterSpan.Empty() { + i.iterPos = iterPosCurr + return i.iterSpan + } + // If the span starts strictly after key, we know there mustn't be an + // earlier span that ends at i.iterSpan.Start, otherwise i.iter would've + // returned that span instead. + if i.comparer.Compare(i.iterSpan.Start, key) > 0 { + return i.defragmentForward() + } + + // The span we landed on has a Start bound ≤ key. There may be additional + // fragments before this span. Defragment backward to find the start of the + // defragmented span. + i.defragmentBackward() + + // Defragmenting backward may have stopped because it encountered an error. + // If so, we must not continue so that i.iter.Error() (and thus i.Error()) + // yields the error. + if i.iterSpan == nil && i.iter.Error() != nil { + return nil + } + + if i.iterPos == iterPosPrev { + // Next once back onto the span. + i.iterSpan = i.iter.Next() + } + // Defragment the full span from its start. + return i.defragmentForward() +} + +// SeekLT moves the iterator to the last span covering a key less than the +// given key. This is equivalent to seeking to the last span with a start +// key less than the given key. +func (i *DefragmentingIter) SeekLT(key []byte) *Span { + i.iterSpan = i.iter.SeekLT(key) + if i.iterSpan == nil { + i.iterPos = iterPosCurr + return nil + } else if i.iterSpan.Empty() { + i.iterPos = iterPosCurr + return i.iterSpan + } + // If the span ends strictly before key, we know there mustn't be a later + // span that starts at i.iterSpan.End, otherwise i.iter would've returned + // that span instead. + if i.comparer.Compare(i.iterSpan.End, key) < 0 { + return i.defragmentBackward() + } + + // The span we landed on has a End bound ≥ key. 
There may be additional + // fragments after this span. Defragment forward to find the end of the + // defragmented span. + i.defragmentForward() + + // Defragmenting forward may have stopped because it encountered an error. + // If so, we must not continue so that i.iter.Error() (and thus i.Error()) + // yields the error. + if i.iterSpan == nil && i.iter.Error() != nil { + return nil + } + + if i.iterPos == iterPosNext { + // Prev once back onto the span. + i.iterSpan = i.iter.Prev() + } + // Defragment the full span from its end. + return i.defragmentBackward() +} + +// First seeks the iterator to the first span and returns it. +func (i *DefragmentingIter) First() *Span { + i.iterSpan = i.iter.First() + if i.iterSpan == nil { + i.iterPos = iterPosCurr + return nil + } + return i.defragmentForward() +} + +// Last seeks the iterator to the last span and returns it. +func (i *DefragmentingIter) Last() *Span { + i.iterSpan = i.iter.Last() + if i.iterSpan == nil { + i.iterPos = iterPosCurr + return nil + } + return i.defragmentBackward() +} + +// Next advances to the next span and returns it. +func (i *DefragmentingIter) Next() *Span { + switch i.iterPos { + case iterPosPrev: + // Switching directions; The iterator is currently positioned over the + // last span of the previous set of fragments. In the below diagram, + // the iterator is positioned over the last span that contributes to + // the defragmented x position. We want to be positioned over the first + // span that contributes to the z position. + // + // x x x y y y z z z + // ^ ^ + // old new + // + // Next once to move onto y, defragment forward to land on the first z + // position. + i.iterSpan = i.iter.Next() + if invariants.Enabled && i.iterSpan == nil && i.iter.Error() == nil { + panic("pebble: invariant violation: no next span while switching directions") + } + // We're now positioned on the first span that was defragmented into the + // current iterator position. 
Skip over the rest of the current iterator
+		// position's constituent fragments. In the above example, this would
+		// land on the first 'z'.
+		i.defragmentForward()
+		if i.iterSpan == nil {
+			i.iterPos = iterPosCurr
+			return nil
+		}
+
+		// Now that we're positioned over the first of the next set of
+		// fragments, defragment forward.
+		return i.defragmentForward()
+	case iterPosCurr:
+		// iterPosCurr is only used when the iter is exhausted or when the iterator
+		// is at an empty span.
+		if invariants.Enabled && i.iterSpan != nil && !i.iterSpan.Empty() {
+			panic("pebble: invariant violation: iterPosCurr with valid iterSpan")
+		}
+
+		i.iterSpan = i.iter.Next()
+		if i.iterSpan == nil {
+			return nil
+		}
+		return i.defragmentForward()
+	case iterPosNext:
+		// Already at the next span.
+		if i.iterSpan == nil {
+			i.iterPos = iterPosCurr
+			return nil
+		}
+		return i.defragmentForward()
+	default:
+		panic("unreachable")
+	}
+}
+
+// Prev steps back to the previous span and returns it.
+func (i *DefragmentingIter) Prev() *Span {
+	switch i.iterPos {
+	case iterPosPrev:
+		// Already at the previous span.
+		if i.iterSpan == nil {
+			i.iterPos = iterPosCurr
+			return nil
+		}
+		return i.defragmentBackward()
+	case iterPosCurr:
+		// iterPosCurr is only used when the iter is exhausted or when the iterator
+		// is at an empty span.
+		if invariants.Enabled && i.iterSpan != nil && !i.iterSpan.Empty() {
+			panic("pebble: invariant violation: iterPosCurr with valid iterSpan")
+		}
+
+		i.iterSpan = i.iter.Prev()
+		if i.iterSpan == nil {
+			return nil
+		}
+		return i.defragmentBackward()
+	case iterPosNext:
+		// Switching directions; The iterator is currently positioned over the
+		// first fragment of the next set of fragments. In the below diagram,
+		// the iterator is positioned over the first span that contributes to
+		// the defragmented z position. We want to be positioned over the last
+		// span that contributes to the x position.
+ // + // x x x y y y z z z + // ^ ^ + // new old + // + // Prev once to move onto y, defragment backward to land on the last x + // position. + i.iterSpan = i.iter.Prev() + if invariants.Enabled && i.iterSpan == nil && i.iter.Error() == nil { + panic("pebble: invariant violation: no previous span while switching directions") + } + // We're now positioned on the last span that was defragmented into the + // current iterator position. Skip over the rest of the current iterator + // position's constitutent fragments. In the above example, this would + // land on the last 'x'. + i.defragmentBackward() + + // Now that we're positioned over the last of the prev set of + // fragments, defragment backward. + if i.iterSpan == nil { + i.iterPos = iterPosCurr + return nil + } + return i.defragmentBackward() + default: + panic("unreachable") + } +} + +// checkEqual checks the two spans for logical equivalence. It uses the passed-in +// DefragmentMethod and ensures both spans are NOT empty; not defragmenting empty +// spans is an optimization that lets us load fewer sstable blocks. +func (i *DefragmentingIter) checkEqual(left, right *Span) bool { + return (!left.Empty() && !right.Empty()) && i.method.ShouldDefragment(i.equal, i.iterSpan, &i.curr) +} + +// defragmentForward defragments spans in the forward direction, starting from +// i.iter's current position. The span at the current position must be non-nil, +// but may be Empty(). +func (i *DefragmentingIter) defragmentForward() *Span { + if i.iterSpan.Empty() { + // An empty span will never be equal to another span; see checkEqual for + // why. To avoid loading non-empty range keys further ahead by calling Next, + // return early. + i.iterPos = iterPosCurr + return i.iterSpan + } + i.saveCurrent() + + i.iterPos = iterPosNext + i.iterSpan = i.iter.Next() + for i.iterSpan != nil { + if !i.equal(i.curr.End, i.iterSpan.Start) { + // Not a continuation. 
+ break + } + if !i.checkEqual(i.iterSpan, &i.curr) { + // Not a continuation. + break + } + i.keyBuf = append(i.keyBuf[:0], i.iterSpan.End...) + i.curr.End = i.keyBuf + i.keysBuf = i.reduce(i.keysBuf, i.iterSpan.Keys) + i.iterSpan = i.iter.Next() + } + // i.iterSpan == nil + // + // The inner iterator may return nil when it encounters an error. If there + // was an error, we don't know whether there is another span we should + // defragment or not. Return nil so that the caller knows they should check + // Error(). + if i.iter.Error() != nil { + return nil + } + i.curr.Keys = i.keysBuf + return &i.curr +} + +// defragmentBackward defragments spans in the backward direction, starting from +// i.iter's current position. The span at the current position must be non-nil, +// but may be Empty(). +func (i *DefragmentingIter) defragmentBackward() *Span { + if i.iterSpan.Empty() { + // An empty span will never be equal to another span; see checkEqual for + // why. To avoid loading non-empty range keys further ahead by calling Next, + // return early. + i.iterPos = iterPosCurr + return i.iterSpan + } + i.saveCurrent() + + i.iterPos = iterPosPrev + i.iterSpan = i.iter.Prev() + for i.iterSpan != nil { + if !i.equal(i.curr.Start, i.iterSpan.End) { + // Not a continuation. + break + } + if !i.checkEqual(i.iterSpan, &i.curr) { + // Not a continuation. + break + } + i.keyBuf = append(i.keyBuf[:0], i.iterSpan.Start...) + i.curr.Start = i.keyBuf + i.keysBuf = i.reduce(i.keysBuf, i.iterSpan.Keys) + i.iterSpan = i.iter.Prev() + } + // i.iterSpan == nil + // + // The inner iterator may return nil when it encounters an error. If there + // was an error, we don't know whether there is another span we should + // defragment or not. Return nil so that the caller knows they should check + // Error(). 
+ if i.iter.Error() != nil { + return nil + } + i.curr.Keys = i.keysBuf + return &i.curr +} + +func (i *DefragmentingIter) saveCurrent() { + i.currBuf.Reset() + i.keysBuf = i.keysBuf[:0] + i.keyBuf = i.keyBuf[:0] + if i.iterSpan == nil { + return + } + i.curr = Span{ + Start: i.saveBytes(i.iterSpan.Start), + End: i.saveBytes(i.iterSpan.End), + KeysOrder: i.iterSpan.KeysOrder, + } + for j := range i.iterSpan.Keys { + i.keysBuf = append(i.keysBuf, Key{ + Trailer: i.iterSpan.Keys[j].Trailer, + Suffix: i.saveBytes(i.iterSpan.Keys[j].Suffix), + Value: i.saveBytes(i.iterSpan.Keys[j].Value), + }) + } + i.curr.Keys = i.keysBuf +} + +func (i *DefragmentingIter) saveBytes(b []byte) []byte { + if b == nil { + return nil + } + i.currBuf, b = i.currBuf.Copy(b) + return b +} diff --git a/pebble/internal/keyspan/defragment_test.go b/pebble/internal/keyspan/defragment_test.go new file mode 100644 index 0000000..b9856da --- /dev/null +++ b/pebble/internal/keyspan/defragment_test.go @@ -0,0 +1,271 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "bytes" + "fmt" + "math/rand" + "sort" + "strings" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/pmezard/go-difflib/difflib" +) + +func TestDefragmentingIter(t *testing.T) { + comparer := testkeys.Comparer + cmp := comparer.Compare + internalEqual := DefragmentInternal + alwaysEqual := DefragmentMethodFunc(func(_ base.Equal, _, _ *Span) bool { return true }) + staticReducer := StaticDefragmentReducer + collectReducer := func(cur, next []Key) []Key { + c := keysBySeqNumKind(append(cur, next...)) + sort.Sort(&c) + return c + } + + var buf bytes.Buffer + var spans []Span + datadriven.RunTest(t, "testdata/defragmenting_iter", func(t *testing.T, td *datadriven.TestData) string { + buf.Reset() + switch td.Cmd { + case "define": + spans = spans[:0] + lines := strings.Split(strings.TrimSpace(td.Input), "\n") + for _, line := range lines { + spans = append(spans, ParseSpan(line)) + } + return "" + case "iter": + equal := internalEqual + reducer := staticReducer + var probes []probe + for _, cmdArg := range td.CmdArgs { + switch cmd := cmdArg.Key; cmd { + case "equal": + if len(cmdArg.Vals) != 1 { + return fmt.Sprintf("only one equal func expected; got %d", len(cmdArg.Vals)) + } + switch val := cmdArg.Vals[0]; val { + case "internal": + equal = internalEqual + case "always": + equal = alwaysEqual + default: + return fmt.Sprintf("unknown reducer %s", val) + } + case "reducer": + if len(cmdArg.Vals) != 1 { + return fmt.Sprintf("only one reducer expected; got %d", len(cmdArg.Vals)) + } + switch val := cmdArg.Vals[0]; val { + case "collect": + reducer = collectReducer + case "static": + reducer = staticReducer + default: + return fmt.Sprintf("unknown reducer %s", val) + } + case "probes": + probes = parseProbes(cmdArg.Vals...) 
+ default: + return fmt.Sprintf("unknown command: %s", cmd) + } + } + var miter MergingIter + miter.Init(cmp, noopTransform, new(MergingBuffers), NewIter(cmp, spans)) + innerIter := attachProbes(&miter, probeContext{log: &buf}, probes...) + var iter DefragmentingIter + iter.Init(comparer, innerIter, equal, reducer, new(DefragmentingBuffers)) + for _, line := range strings.Split(td.Input, "\n") { + runIterOp(&buf, &iter, line) + fmt.Fprintln(&buf) + } + return strings.TrimSpace(buf.String()) + default: + return fmt.Sprintf("unrecognized command %q", td.Cmd) + } + }) +} + +func TestDefragmentingIter_Randomized(t *testing.T) { + seed := time.Now().UnixNano() + for i := int64(0); i < 100; i++ { + testDefragmentingIteRandomizedOnce(t, seed+i) + } +} + +func TestDefragmentingIter_RandomizedFixedSeed(t *testing.T) { + const seed = 1648173101214881000 + testDefragmentingIteRandomizedOnce(t, seed) +} + +func testDefragmentingIteRandomizedOnce(t *testing.T, seed int64) { + comparer := testkeys.Comparer + cmp := comparer.Compare + formatKey := comparer.FormatKey + + rng := rand.New(rand.NewSource(seed)) + t.Logf("seed = %d", seed) + + // Use a key space of alphanumeric strings, with a random max length between + // 1-2. Repeat keys are more common at the lower max lengths. + ks := testkeys.Alpha(rng.Intn(2) + 1) + + // Generate between 1-15 range keys. + const maxRangeKeys = 15 + var original, fragmented []Span + numRangeKeys := 1 + rng.Intn(maxRangeKeys) + for i := 0; i < numRangeKeys; i++ { + startIdx := rng.Int63n(ks.Count()) + endIdx := rng.Int63n(ks.Count()) + for startIdx == endIdx { + endIdx = rng.Int63n(ks.Count()) + } + if startIdx > endIdx { + startIdx, endIdx = endIdx, startIdx + } + + key := Key{ + Trailer: base.MakeTrailer(uint64(i), base.InternalKeyKindRangeKeySet), + Value: []byte(fmt.Sprintf("v%d", rng.Intn(3))), + } + // Generate suffixes 0, 1, 2, or 3 with 0 indicating none. 
+ if suffix := rng.Int63n(4); suffix > 0 { + key.Suffix = testkeys.Suffix(suffix) + } + original = append(original, Span{ + Start: testkeys.Key(ks, startIdx), + End: testkeys.Key(ks, endIdx), + Keys: []Key{key}, + }) + + for startIdx < endIdx { + width := rng.Int63n(endIdx-startIdx) + 1 + fragmented = append(fragmented, Span{ + Start: testkeys.Key(ks, startIdx), + End: testkeys.Key(ks, startIdx+width), + Keys: []Key{key}, + }) + startIdx += width + } + } + + // Both the original and the deliberately fragmented spans may contain + // overlaps, so we need to sort and fragment them. + original = fragment(cmp, formatKey, original) + fragmented = fragment(cmp, formatKey, fragmented) + + var originalInner MergingIter + originalInner.Init(cmp, noopTransform, new(MergingBuffers), NewIter(cmp, original)) + var fragmentedInner MergingIter + fragmentedInner.Init(cmp, noopTransform, new(MergingBuffers), NewIter(cmp, fragmented)) + + var referenceIter, fragmentedIter DefragmentingIter + referenceIter.Init(comparer, &originalInner, DefragmentInternal, StaticDefragmentReducer, new(DefragmentingBuffers)) + fragmentedIter.Init(comparer, &fragmentedInner, DefragmentInternal, StaticDefragmentReducer, new(DefragmentingBuffers)) + + // Generate 100 random operations and run them against both iterators. 
+ const numIterOps = 100 + type opKind struct { + weight int + fn func() string + } + ops := []opKind{ + {weight: 2, fn: func() string { return "first" }}, + {weight: 2, fn: func() string { return "last" }}, + {weight: 50, fn: func() string { return "next" }}, + {weight: 50, fn: func() string { return "prev" }}, + {weight: 5, fn: func() string { + k := testkeys.Key(ks, rng.Int63n(ks.Count())) + return fmt.Sprintf("seekge(%s)", k) + }}, + {weight: 5, fn: func() string { + k := testkeys.Key(ks, rng.Int63n(ks.Count())) + return fmt.Sprintf("seeklt(%s)", k) + }}, + } + var totalWeight int + for _, op := range ops { + totalWeight += op.weight + } + var referenceHistory, fragmentedHistory bytes.Buffer + for i := 0; i < numIterOps; i++ { + p := rng.Intn(totalWeight) + opIndex := 0 + if i == 0 { + // First op is always a First(). + } else { + for i, op := range ops { + if p < op.weight { + opIndex = i + break + } + p -= op.weight + } + } + op := ops[opIndex].fn() + runIterOp(&referenceHistory, &referenceIter, op) + runIterOp(&fragmentedHistory, &fragmentedIter, op) + if !bytes.Equal(referenceHistory.Bytes(), fragmentedHistory.Bytes()) { + t.Fatal(debugContext(cmp, formatKey, original, fragmented, + referenceHistory.String(), fragmentedHistory.String())) + } + fmt.Fprintln(&referenceHistory) + fmt.Fprintln(&fragmentedHistory) + } +} + +func fragment(cmp base.Compare, formatKey base.FormatKey, spans []Span) []Span { + Sort(cmp, spans) + var fragments []Span + f := Fragmenter{ + Cmp: cmp, + Format: formatKey, + Emit: func(f Span) { + fragments = append(fragments, f) + }, + } + for _, s := range spans { + f.Add(s) + } + f.Finish() + return fragments +} + +func debugContext( + cmp base.Compare, + formatKey base.FormatKey, + original, fragmented []Span, + refHistory, fragHistory string, +) string { + var buf bytes.Buffer + fmt.Fprintln(&buf, "Reference:") + for _, s := range original { + fmt.Fprintln(&buf, s) + } + fmt.Fprintln(&buf) + fmt.Fprintln(&buf, "Fragmented:") + for _, 
s := range fragmented { + fmt.Fprintln(&buf, s) + } + fmt.Fprintln(&buf) + fmt.Fprintln(&buf, "\nOperations diff:") + diff, err := difflib.GetUnifiedDiffString(difflib.UnifiedDiff{ + A: difflib.SplitLines(refHistory), + B: difflib.SplitLines(fragHistory), + Context: 5, + }) + if err != nil { + panic(err) + } + fmt.Fprintln(&buf, diff) + return buf.String() +} diff --git a/pebble/internal/keyspan/doc.go b/pebble/internal/keyspan/doc.go new file mode 100644 index 0000000..e05aad2 --- /dev/null +++ b/pebble/internal/keyspan/doc.go @@ -0,0 +1,13 @@ +// Package keyspan provides facilities for sorting, fragmenting and +// iterating over spans of user keys. +// +// A Span represents a range of user key space with an inclusive start +// key and exclusive end key. A span may hold any number of Keys which are +// applied over the entirety of the span's keyspace. +// +// Spans are used within Pebble as an in-memory representation of range +// deletion tombstones, and range key sets, unsets and deletes. Spans +// are fragmented at overlapping key boundaries by the Fragmenter type. +// This package's various iteration facilities require these +// non-overlapping fragmented spans. +package keyspan diff --git a/pebble/internal/keyspan/filter.go b/pebble/internal/keyspan/filter.go new file mode 100644 index 0000000..a63a43c --- /dev/null +++ b/pebble/internal/keyspan/filter.go @@ -0,0 +1,115 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import "github.com/cockroachdb/pebble/internal/base" + +// FilterFunc defines a transform from the input Span into the output Span. The +// function returns true if the Span should be returned by the iterator, and +// false if the Span should be skipped. 
The FilterFunc is permitted to mutate
+// the output Span, for example, to elide certain keys, or update the Span's
+// bounds if so desired. The output Span's Keys slice may be reused to reduce
+// allocations.
+type FilterFunc func(in *Span, out *Span) (keep bool)
+
+// filteringIter is a FragmentIterator that uses a FilterFunc to select which
+// Spans from the input iterator are returned in the output.
+//
+// A note on Span lifetimes: as the FilterFunc reuses a Span with a mutable
+// slice of Keys to reduce allocations, Spans returned by this iterator are only
+// valid until the next relative or absolute positioning method is called.
+type filteringIter struct {
+	iter     FragmentIterator
+	filterFn FilterFunc
+	cmp      base.Compare
+
+	// span is a mutable Span passed to the filterFn. The filterFn is free to
+	// mutate this Span. The slice of Keys in the Span is reused with every call
+	// to the filterFn.
+	span Span
+}
+
+var _ FragmentIterator = (*filteringIter)(nil)
+
+// Filter returns a new filteringIter that will filter the Spans from the
+// provided child iterator using the provided FilterFunc.
+func Filter(iter FragmentIterator, filter FilterFunc, cmp base.Compare) FragmentIterator {
+	return &filteringIter{iter: iter, filterFn: filter, cmp: cmp}
+}
+
+// SeekGE implements FragmentIterator.
+func (i *filteringIter) SeekGE(key []byte) *Span {
+	span := i.filter(i.iter.SeekGE(key), +1)
+	// i.filter could return a span that's less than key, _if_ the filterFunc
+	// (which has no knowledge of the seek key) mutated the span to end at a key
+	// less than or equal to `key`. Detect this case and next/invalidate the iter.
+	if span != nil && i.cmp(span.End, key) <= 0 {
+		return i.Next()
+	}
+	return span
+}
+
+// SeekLT implements FragmentIterator.
+func (i *filteringIter) SeekLT(key []byte) *Span { + span := i.filter(i.iter.SeekLT(key), -1) + // i.filter could return a span that's >= key, _if_ the filterFunc (which has + // no knowledge of the seek key) mutated the span to start at a key greater + // than or equal to `key`. Detect this case and prev/invalidate the iter. + if span != nil && i.cmp(span.Start, key) >= 0 { + return i.Prev() + } + return span +} + +// First implements FragmentIterator. +func (i *filteringIter) First() *Span { + return i.filter(i.iter.First(), +1) +} + +// Last implements FragmentIterator. +func (i *filteringIter) Last() *Span { + return i.filter(i.iter.Last(), -1) +} + +// Next implements FragmentIterator. +func (i *filteringIter) Next() *Span { + return i.filter(i.iter.Next(), +1) +} + +// Prev implements FragmentIterator. +func (i *filteringIter) Prev() *Span { + return i.filter(i.iter.Prev(), -1) +} + +// Error implements FragmentIterator. +func (i *filteringIter) Error() error { + return i.iter.Error() +} + +// Close implements FragmentIterator. +func (i *filteringIter) Close() error { + return i.iter.Close() +} + +// filter uses the filterFn (if configured) to filter and possibly mutate the +// given Span. If the current Span is to be skipped, the iterator continues +// iterating in the given direction until it lands on a Span that should be +// returned, or the iterator becomes invalid. +func (i *filteringIter) filter(span *Span, dir int8) *Span { + if i.filterFn == nil { + return span + } + for i.Error() == nil && span != nil { + if keep := i.filterFn(span, &i.span); keep { + return &i.span + } + if dir == +1 { + span = i.iter.Next() + } else { + span = i.iter.Prev() + } + } + return span +} diff --git a/pebble/internal/keyspan/filter_test.go b/pebble/internal/keyspan/filter_test.go new file mode 100644 index 0000000..beb4de8 --- /dev/null +++ b/pebble/internal/keyspan/filter_test.go @@ -0,0 +1,79 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. 
All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "fmt" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/testkeys" +) + +func TestFilteringIter(t *testing.T) { + // makeFilter returns a FilterFunc that will filter out all keys in a Span + // that are not of the given kind. Empty spans are skipped. + makeFilter := func(kind base.InternalKeyKind) FilterFunc { + return func(in *Span, out *Span) (keep bool) { + out.Start, out.End = in.Start, in.End + out.Keys = out.Keys[:0] + for _, k := range in.Keys { + if k.Kind() != kind { + continue + } + out.Keys = append(out.Keys, k) + } + return len(out.Keys) > 0 + } + } + + cmp := testkeys.Comparer.Compare + var spans []Span + datadriven.RunTest(t, "testdata/filtering_iter", func(t *testing.T, td *datadriven.TestData) string { + switch cmd := td.Cmd; cmd { + case "define": + spans = spans[:0] + lines := strings.Split(strings.TrimSpace(td.Input), "\n") + for _, line := range lines { + spans = append(spans, ParseSpan(line)) + } + return "" + + case "iter": + var filter FilterFunc + for _, cmdArg := range td.CmdArgs { + switch cmdArg.Key { + case "filter": + for _, s := range cmdArg.Vals { + switch s { + case "no-op": + filter = nil + case "key-kind-set": + filter = makeFilter(base.InternalKeyKindRangeKeySet) + case "key-kind-unset": + filter = makeFilter(base.InternalKeyKindRangeKeyUnset) + case "key-kind-del": + filter = makeFilter(base.InternalKeyKindRangeKeyDelete) + default: + return fmt.Sprintf("unknown filter: %s", s) + } + } + default: + return fmt.Sprintf("unknown command: %s", cmdArg.Key) + } + } + innerIter := NewIter(cmp, spans) + iter := Filter(innerIter, filter, cmp) + defer iter.Close() + s := runFragmentIteratorCmd(iter, td.Input, nil) + return s + + default: + return fmt.Sprintf("unknown command: %s", 
cmd) + } + }) +} diff --git a/pebble/internal/keyspan/fragmenter.go b/pebble/internal/keyspan/fragmenter.go new file mode 100644 index 0000000..d4a410d --- /dev/null +++ b/pebble/internal/keyspan/fragmenter.go @@ -0,0 +1,483 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "fmt" + "sort" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" +) + +type spansByStartKey struct { + cmp base.Compare + buf []Span +} + +func (v *spansByStartKey) Len() int { return len(v.buf) } +func (v *spansByStartKey) Less(i, j int) bool { + return v.cmp(v.buf[i].Start, v.buf[j].Start) < 0 +} +func (v *spansByStartKey) Swap(i, j int) { + v.buf[i], v.buf[j] = v.buf[j], v.buf[i] +} + +type spansByEndKey struct { + cmp base.Compare + buf []Span +} + +func (v *spansByEndKey) Len() int { return len(v.buf) } +func (v *spansByEndKey) Less(i, j int) bool { + return v.cmp(v.buf[i].End, v.buf[j].End) < 0 +} +func (v *spansByEndKey) Swap(i, j int) { + v.buf[i], v.buf[j] = v.buf[j], v.buf[i] +} + +// keysBySeqNumKind sorts spans by the start key's sequence number in +// descending order. If two spans have equal sequence number, they're compared +// by key kind in descending order. This ordering matches the ordering of +// base.InternalCompare among keys with matching user keys. +type keysBySeqNumKind []Key + +func (v *keysBySeqNumKind) Len() int { return len(*v) } +func (v *keysBySeqNumKind) Less(i, j int) bool { return (*v)[i].Trailer > (*v)[j].Trailer } +func (v *keysBySeqNumKind) Swap(i, j int) { (*v)[i], (*v)[j] = (*v)[j], (*v)[i] } + +// Sort the spans by start key. This is the ordering required by the +// Fragmenter. Usually spans are naturally sorted by their start key, +// but that isn't true for range deletion tombstones in the legacy +// range-del-v1 block format. 
+func Sort(cmp base.Compare, spans []Span) { + sorter := spansByStartKey{ + cmp: cmp, + buf: spans, + } + sort.Sort(&sorter) +} + +// Fragmenter fragments a set of spans such that overlapping spans are +// split at their overlap points. The fragmented spans are output to the +// supplied Output function. +type Fragmenter struct { + Cmp base.Compare + Format base.FormatKey + // Emit is called to emit a fragmented span and its keys. Every key defined + // within the emitted Span applies to the entirety of the Span's key span. + // Keys are ordered in decreasing order of their sequence numbers, and if + // equal, decreasing order of key kind. + Emit func(Span) + // pending contains the list of pending fragments that have not been + // flushed to the block writer. Note that the spans have not been + // fragmented on the end keys yet. That happens as the spans are + // flushed. All pending spans have the same Start. + pending []Span + // doneBuf is used to buffer completed span fragments when flushing to a + // specific key (e.g. TruncateAndFlushTo). It is cached in the Fragmenter to + // allow reuse. + doneBuf []Span + // sortBuf is used to sort fragments by end key when flushing. + sortBuf spansByEndKey + // flushBuf is used to sort keys by (seqnum,kind) before emitting. + flushBuf keysBySeqNumKind + // flushedKey is the key that fragments have been flushed up to. Any + // additional spans added to the fragmenter must have a start key >= + // flushedKey. A nil value indicates flushedKey has not been set. 
+ flushedKey []byte + finished bool +} + +func (f *Fragmenter) checkInvariants(buf []Span) { + for i := 1; i < len(buf); i++ { + if f.Cmp(buf[i].Start, buf[i].End) >= 0 { + panic(fmt.Sprintf("pebble: empty pending span invariant violated: %s", buf[i])) + } + if f.Cmp(buf[i-1].Start, buf[i].Start) != 0 { + panic(fmt.Sprintf("pebble: pending span invariant violated: %s %s", + f.Format(buf[i-1].Start), f.Format(buf[i].Start))) + } + } +} + +// Add adds a span to the fragmenter. Spans may overlap and the +// fragmenter will internally split them. The spans must be presented in +// increasing start key order. That is, Add must be called with a series +// of spans like: +// +// a---e +// c---g +// c-----i +// j---n +// j-l +// +// We need to fragment the spans at overlap points. In the above +// example, we'd create: +// +// a-c-e +// c-e-g +// c-e-g-i +// j-l-n +// j-l +// +// The fragments need to be output sorted by start key, and for equal start +// keys, sorted by descending sequence number. This last part requires a mild +// bit of care as the fragments are not created in descending sequence number +// order. +// +// Once a start key has been seen, we know that we'll never see a smaller +// start key and can thus flush all of the fragments that lie before that +// start key. +// +// Walking through the example above, we start with: +// +// a---e +// +// Next we add [c,g) resulting in: +// +// a-c-e +// c---g +// +// The fragment [a,c) is flushed leaving the pending spans as: +// +// c-e +// c---g +// +// The next span is [c,i): +// +// c-e +// c---g +// c-----i +// +// No fragments are flushed. The next span is [j,n): +// +// c-e +// c---g +// c-----i +// j---n +// +// The fragments [c,e), [c,g) and [c,i) are flushed. 
We sort these fragments +// by their end key, then split the fragments on the end keys: +// +// c-e +// c-e-g +// c-e---i +// +// The [c,e) fragments all get flushed leaving: +// +// e-g +// e---i +// +// This process continues until there are no more fragments to flush. +// +// WARNING: the slices backing Start, End, Keys, Key.Suffix and Key.Value are +// all retained after this method returns and should not be modified. This is +// safe for spans that are added from a memtable or batch. It is partially +// unsafe for a span read from an sstable. Specifically, the Keys slice of a +// Span returned during sstable iteration is only valid until the next iterator +// operation. The stability of the user keys depend on whether the block is +// prefix compressed, and in practice Pebble never prefix compresses range +// deletion and range key blocks, so these keys are stable. Because of this key +// stability, typically callers only need to perform a shallow clone of the Span +// before Add-ing it to the fragmenter. +// +// Add requires the provided span's keys are sorted in Trailer descending order. +func (f *Fragmenter) Add(s Span) { + if f.finished { + panic("pebble: span fragmenter already finished") + } else if s.KeysOrder != ByTrailerDesc { + panic("pebble: span keys unexpectedly not in trailer descending order") + } + if f.flushedKey != nil { + switch c := f.Cmp(s.Start, f.flushedKey); { + case c < 0: + panic(fmt.Sprintf("pebble: start key (%s) < flushed key (%s)", + f.Format(s.Start), f.Format(f.flushedKey))) + } + } + if f.Cmp(s.Start, s.End) >= 0 { + // An empty span, we can ignore it. + return + } + if invariants.RaceEnabled { + f.checkInvariants(f.pending) + defer func() { f.checkInvariants(f.pending) }() + } + + if len(f.pending) > 0 { + // Since all of the pending spans have the same start key, we only need + // to compare against the first one. 
+		switch c := f.Cmp(f.pending[0].Start, s.Start); {
+		case c > 0:
+			panic(fmt.Sprintf("pebble: keys must be added in order: %s > %s",
+				f.Format(f.pending[0].Start), f.Format(s.Start)))
+		case c == 0:
+			// The new span has the same start key as the existing pending
+			// spans. Add it to the pending buffer.
+			f.pending = append(f.pending, s)
+			return
+		}
+
+		// At this point we know that the new start key is greater than the pending
+		// spans start keys.
+		f.truncateAndFlush(s.Start)
+	}
+
+	f.pending = append(f.pending, s)
+}
+
+// Cover is returned by Fragmenter.Covers and describes a span's relationship to
+// a key at a particular snapshot.
+type Cover int8
+
+const (
+	// NoCover indicates the tested key does not fall within the span's bounds,
+	// or the span contains no keys with sequence numbers higher than the key's.
+	NoCover Cover = iota
+	// CoversInvisibly indicates the tested key does fall within the span's
+	// bounds and the span contains at least one key with a higher sequence
+	// number, but none visible at the provided snapshot.
+	CoversInvisibly
+	// CoversVisibly indicates the tested key does fall within the span's
+	// bounds, and the span contains at least one key with a sequence number
+	// higher than the key's sequence number that is visible at the provided
+	// snapshot.
+	CoversVisibly
+)
+
+// Covers returns an enum indicating whether the specified key is covered by one
+// of the pending keys. The provided key must be consistent with the ordering of
+// the spans. That is, it is invalid to specify a key here that is out of order
+// with the span start keys passed to Add.
+func (f *Fragmenter) Covers(key base.InternalKey, snapshot uint64) Cover { + if f.finished { + panic("pebble: span fragmenter already finished") + } + if len(f.pending) == 0 { + return NoCover + } + + if f.Cmp(f.pending[0].Start, key.UserKey) > 0 { + panic(fmt.Sprintf("pebble: keys must be in order: %s > %s", + f.Format(f.pending[0].Start), key.Pretty(f.Format))) + } + + cover := NoCover + seqNum := key.SeqNum() + for _, s := range f.pending { + if f.Cmp(key.UserKey, s.End) < 0 { + // NB: A range deletion tombstone does not delete a point operation + // at the same sequence number, and broadly a span is not considered + // to cover a point operation at the same sequence number. + + for i := range s.Keys { + if kseq := s.Keys[i].SeqNum(); kseq > seqNum { + // This key from the span has a higher sequence number than + // `key`. It covers `key`, although the span's key might not + // be visible if its snapshot is too high. + // + // Batch keys are always visible. + if kseq < snapshot || kseq&base.InternalKeySeqNumBatch != 0 { + return CoversVisibly + } + // s.Keys[i] is not visible. + cover = CoversInvisibly + } + } + } + } + return cover +} + +// Empty returns true if all fragments added so far have finished flushing. +func (f *Fragmenter) Empty() bool { + return f.finished || len(f.pending) == 0 +} + +// TruncateAndFlushTo flushes all of the fragments with a start key <= key, +// truncating spans to the specified end key. Used during compaction to force +// emitting of spans which straddle an sstable boundary. Consider +// the scenario: +// +// a---------k#10 +// f#8 +// f#7 +// +// Let's say the next user key after f is g. Calling TruncateAndFlushTo(g) will +// flush this span: +// +// a-------g#10 +// f#8 +// f#7 +// +// And leave this one in f.pending: +// +// g----k#10 +// +// WARNING: The fragmenter could hold on to the specified end key. 
Ensure it's +// a safe byte slice that could outlast the current sstable output, and one +// that will never be modified. +func (f *Fragmenter) TruncateAndFlushTo(key []byte) { + if f.finished { + panic("pebble: span fragmenter already finished") + } + if f.flushedKey != nil { + switch c := f.Cmp(key, f.flushedKey); { + case c < 0: + panic(fmt.Sprintf("pebble: start key (%s) < flushed key (%s)", + f.Format(key), f.Format(f.flushedKey))) + } + } + if invariants.RaceEnabled { + f.checkInvariants(f.pending) + defer func() { f.checkInvariants(f.pending) }() + } + if len(f.pending) > 0 { + // Since all of the pending spans have the same start key, we only need + // to compare against the first one. + switch c := f.Cmp(f.pending[0].Start, key); { + case c > 0: + panic(fmt.Sprintf("pebble: keys must be added in order: %s > %s", + f.Format(f.pending[0].Start), f.Format(key))) + case c == 0: + return + } + } + f.truncateAndFlush(key) +} + +// Start returns the start key of the first span in the pending buffer, or nil +// if there are no pending spans. The start key of all pending spans is the same +// as that of the first one. +func (f *Fragmenter) Start() []byte { + if len(f.pending) > 0 { + return f.pending[0].Start + } + return nil +} + +// Flushes all pending spans up to key (exclusive). +// +// WARNING: The specified key is stored without making a copy, so all callers +// must ensure it is safe. +func (f *Fragmenter) truncateAndFlush(key []byte) { + f.flushedKey = append(f.flushedKey[:0], key...) + done := f.doneBuf[:0] + pending := f.pending + f.pending = f.pending[:0] + + // pending and f.pending share the same underlying storage. As we iterate + // over pending we append to f.pending, but only one entry is appended in + // each iteration, after we have read the entry being overwritten. 
+ for _, s := range pending { + if f.Cmp(key, s.End) < 0 { + // s: a--+--e + // new: c------ + if f.Cmp(s.Start, key) < 0 { + done = append(done, Span{ + Start: s.Start, + End: key, + Keys: s.Keys, + }) + } + f.pending = append(f.pending, Span{ + Start: key, + End: s.End, + Keys: s.Keys, + }) + } else { + // s: a-----e + // new: e---- + done = append(done, s) + } + } + + f.doneBuf = done[:0] + f.flush(done, nil) +} + +// flush a group of range spans to the block. The spans are required to all have +// the same start key. We flush all span fragments until startKey > lastKey. If +// lastKey is nil, all span fragments are flushed. The specification of a +// non-nil lastKey occurs for range deletion tombstones during compaction where +// we want to flush (but not truncate) all range tombstones that start at or +// before the first key in the next sstable. Consider: +// +// a---e#10 +// a------h#9 +// +// If a compaction splits the sstables at key c we want the first sstable to +// contain the tombstones [a,e)#10 and [a,e)#9. Fragmentation would naturally +// produce a tombstone [e,h)#9, but we don't need to output that tombstone to +// the first sstable. +func (f *Fragmenter) flush(buf []Span, lastKey []byte) { + if invariants.RaceEnabled { + f.checkInvariants(buf) + } + + // Sort the spans by end key. This will allow us to walk over the spans and + // easily determine the next split point (the smallest end-key). + f.sortBuf.cmp = f.Cmp + f.sortBuf.buf = buf + sort.Sort(&f.sortBuf) + + // Loop over the spans, splitting by end key. + for len(buf) > 0 { + // A prefix of spans will end at split. remove represents the count of + // that prefix. + remove := 1 + split := buf[0].End + f.flushBuf = append(f.flushBuf[:0], buf[0].Keys...) + + for i := 1; i < len(buf); i++ { + if f.Cmp(split, buf[i].End) == 0 { + remove++ + } + f.flushBuf = append(f.flushBuf, buf[i].Keys...) 
+ } + + sort.Sort(&f.flushBuf) + + f.Emit(Span{ + Start: buf[0].Start, + End: split, + // Copy the sorted keys to a new slice. + // + // This allocation is an unfortunate side effect of the Fragmenter and + // the expectation that the spans it produces are available in-memory + // indefinitely. + // + // Eventually, we should be able to replace the fragmenter with the + // keyspan.MergingIter which will perform just-in-time + // fragmentation, and only guaranteeing the memory lifetime for the + // current span. The MergingIter fragments while only needing to + // access one Span per level. It only accesses the Span at the + // current position for each level. During compactions, we can write + // these spans to sstables without retaining previous Spans. + Keys: append([]Key(nil), f.flushBuf...), + }) + + if lastKey != nil && f.Cmp(split, lastKey) > 0 { + break + } + + // Adjust the start key for every remaining span. + buf = buf[remove:] + for i := range buf { + buf[i].Start = split + } + } +} + +// Finish flushes any remaining fragments to the output. It is an error to call +// this if any other spans will be added. +func (f *Fragmenter) Finish() { + if f.finished { + panic("pebble: span fragmenter already finished") + } + f.flush(f.pending, nil) + f.finished = true +} diff --git a/pebble/internal/keyspan/fragmenter_test.go b/pebble/internal/keyspan/fragmenter_test.go new file mode 100644 index 0000000..6916f15 --- /dev/null +++ b/pebble/internal/keyspan/fragmenter_test.go @@ -0,0 +1,320 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "bytes" + "fmt" + "regexp" + "strconv" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/stretchr/testify/require" +) + +var spanRe = regexp.MustCompile(`(\d+):\s*(\w+)-*(\w+)\w*([^\n]*)`) + +func parseSpanSingleKey(t *testing.T, s string, kind base.InternalKeyKind) Span { + m := spanRe.FindStringSubmatch(s) + if len(m) != 5 { + t.Fatalf("expected 5 components, but found %d: %s", len(m), s) + } + seqNum, err := strconv.Atoi(m[1]) + require.NoError(t, err) + return Span{ + Start: []byte(m[2]), + End: []byte(m[3]), + Keys: []Key{ + { + Trailer: base.MakeTrailer(uint64(seqNum), kind), + Value: []byte(strings.TrimSpace(m[4])), + }, + }, + } +} + +func buildSpans( + t *testing.T, cmp base.Compare, formatKey base.FormatKey, s string, kind base.InternalKeyKind, +) []Span { + var spans []Span + f := &Fragmenter{ + Cmp: cmp, + Format: formatKey, + Emit: func(fragmented Span) { + spans = append(spans, fragmented) + }, + } + for _, line := range strings.Split(s, "\n") { + if strings.HasPrefix(line, "truncate-and-flush-to ") { + parts := strings.Split(line, " ") + if len(parts) != 2 { + t.Fatalf("expected 2 components, but found %d: %s", len(parts), line) + } + f.TruncateAndFlushTo([]byte(parts[1])) + continue + } + + f.Add(parseSpanSingleKey(t, line, kind)) + } + f.Finish() + return spans +} + +func formatAlphabeticSpans(spans []Span) string { + isLetter := func(b []byte) bool { + if len(b) != 1 { + return false + } + return b[0] >= 'a' && b[0] <= 'z' + } + + var buf bytes.Buffer + for _, v := range spans { + switch { + case !v.Valid(): + fmt.Fprintf(&buf, "\n") + case v.Empty(): + fmt.Fprintf(&buf, "\n") + case !isLetter(v.Start) || !isLetter(v.End) || v.Start[0] == v.End[0]: + for _, k := range v.Keys { + fmt.Fprintf(&buf, "%d: %s-%s", k.SeqNum(), v.Start, v.End) + if len(k.Value) > 0 { + buf.WriteString(strings.Repeat(" ", int('z'-v.End[0]+1))) + 
buf.WriteString(string(k.Value)) + } + fmt.Fprintln(&buf) + } + default: + for _, k := range v.Keys { + fmt.Fprintf(&buf, "%d: %s%s%s%s", + k.SeqNum(), + strings.Repeat(" ", int(v.Start[0]-'a')), + v.Start, + strings.Repeat("-", int(v.End[0]-v.Start[0]-1)), + v.End) + if len(k.Value) > 0 { + buf.WriteString(strings.Repeat(" ", int('z'-v.End[0]+1))) + buf.WriteString(string(k.Value)) + } + fmt.Fprintln(&buf) + } + } + } + return buf.String() +} + +func TestFragmenter(t *testing.T) { + cmp := base.DefaultComparer.Compare + fmtKey := base.DefaultComparer.FormatKey + + var getRe = regexp.MustCompile(`(\w+)#(\d+)`) + + parseGet := func(t *testing.T, s string) (string, int) { + m := getRe.FindStringSubmatch(s) + if len(m) != 3 { + t.Fatalf("expected 3 components, but found %d", len(m)) + } + seq, err := strconv.Atoi(m[2]) + require.NoError(t, err) + return m[1], seq + } + + var iter FragmentIterator + + // Returns true if the specified pair is deleted at the specified + // read sequence number. Get ignores spans newer than the read sequence + // number. This is a simple version of what full processing of range + // tombstones looks like. 
+ deleted := func(key []byte, seq, readSeq uint64) bool { + s := Get(cmp, iter, key) + return s != nil && s.CoversAt(readSeq, seq) + } + + datadriven.RunTest(t, "testdata/fragmenter", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "build": + return func() (result string) { + defer func() { + if r := recover(); r != nil { + result = fmt.Sprint(r) + } + }() + + spans := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeDelete) + iter = NewIter(cmp, spans) + return formatAlphabeticSpans(spans) + }() + + case "get": + if len(d.CmdArgs) != 1 { + return fmt.Sprintf("expected 1 argument, but found %s", d.CmdArgs) + } + if d.CmdArgs[0].Key != "t" { + return fmt.Sprintf("expected timestamp argument, but found %s", d.CmdArgs[0]) + } + readSeq, err := strconv.Atoi(d.CmdArgs[0].Vals[0]) + require.NoError(t, err) + + var results []string + for _, p := range strings.Split(d.Input, " ") { + key, seq := parseGet(t, p) + if deleted([]byte(key), uint64(seq), uint64(readSeq)) { + results = append(results, "deleted") + } else { + results = append(results, "alive") + } + } + return strings.Join(results, " ") + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestFragmenterCovers(t *testing.T) { + datadriven.RunTest(t, "testdata/fragmenter_covers", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "build": + f := &Fragmenter{ + Cmp: base.DefaultComparer.Compare, + Format: base.DefaultComparer.FormatKey, + Emit: func(fragmented Span) { + }, + } + var buf bytes.Buffer + for _, line := range strings.Split(d.Input, "\n") { + switch { + case strings.HasPrefix(line, "add "): + t := parseSpanSingleKey(t, strings.TrimPrefix(line, "add "), base.InternalKeyKindRangeDelete) + f.Add(t) + case strings.HasPrefix(line, "deleted "): + fields := strings.Fields(strings.TrimPrefix(line, "deleted ")) + key := base.ParseInternalKey(fields[0]) + snapshot, err := strconv.ParseUint(fields[1], 10, 64) + if err != 
nil { + return err.Error() + } + func() { + defer func() { + if r := recover(); r != nil { + fmt.Fprintf(&buf, "%s: %s\n", key, r) + } + }() + switch f.Covers(key, snapshot) { + case NoCover: + fmt.Fprintf(&buf, "%s: none\n", key) + case CoversInvisibly: + fmt.Fprintf(&buf, "%s: invisibly\n", key) + case CoversVisibly: + fmt.Fprintf(&buf, "%s: visibly\n", key) + } + }() + } + } + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestFragmenterTruncateAndFlushTo(t *testing.T) { + cmp := base.DefaultComparer.Compare + fmtKey := base.DefaultComparer.FormatKey + + datadriven.RunTest(t, "testdata/fragmenter_truncate_and_flush_to", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "build": + return func() (result string) { + defer func() { + if r := recover(); r != nil { + result = fmt.Sprint(r) + } + }() + + spans := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeDelete) + return formatAlphabeticSpans(spans) + }() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestFragmenter_Values(t *testing.T) { + cmp := base.DefaultComparer.Compare + fmtKey := base.DefaultComparer.FormatKey + + datadriven.RunTest(t, "testdata/fragmenter_values", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "build": + return func() (result string) { + defer func() { + if r := recover(); r != nil { + result = fmt.Sprint(r) + } + }() + + spans := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeKeySet) + return formatAlphabeticSpans(spans) + }() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestFragmenter_EmitOrder(t *testing.T) { + var buf bytes.Buffer + + datadriven.RunTest(t, "testdata/fragmenter_emit_order", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "build": + buf.Reset() + f := Fragmenter{ + Cmp: base.DefaultComparer.Compare, + Format: 
base.DefaultComparer.FormatKey, + Emit: func(span Span) { + fmt.Fprintf(&buf, "%s %s:", + base.DefaultComparer.FormatKey(span.Start), + base.DefaultComparer.FormatKey(span.End)) + for i, k := range span.Keys { + if i == 0 { + fmt.Fprint(&buf, " ") + } else { + fmt.Fprint(&buf, ", ") + } + fmt.Fprintf(&buf, "#%d,%s", k.SeqNum(), k.Kind()) + } + fmt.Fprintln(&buf, "\n-") + }, + } + for _, line := range strings.Split(d.Input, "\n") { + fields := strings.Fields(line) + if len(fields) != 2 { + panic(fmt.Sprintf("datadriven test: expect 2 fields, found %d", len(fields))) + } + k := base.ParseInternalKey(fields[0]) + f.Add(Span{ + Start: k.UserKey, + End: []byte(fields[1]), + Keys: []Key{{Trailer: k.Trailer}}, + }) + } + + f.Finish() + return buf.String() + default: + panic(fmt.Sprintf("unrecognized command %q", d.Cmd)) + } + }) +} diff --git a/pebble/internal/keyspan/get.go b/pebble/internal/keyspan/get.go new file mode 100644 index 0000000..c07f8c8 --- /dev/null +++ b/pebble/internal/keyspan/get.go @@ -0,0 +1,53 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import "github.com/cockroachdb/pebble/internal/base" + +// Get returns the newest span that contains the target key. If no span +// contains the target key, an empty span is returned. The snapshot +// parameter controls the visibility of spans (only spans older than the +// snapshot sequence number are visible). The iterator must contain +// fragmented spans: no span may overlap another. +func Get(cmp base.Compare, iter FragmentIterator, key []byte) *Span { + // NB: We use SeekLT in order to land on the proper span for a search + // key that resides in the middle of a span. Consider the scenario: + // + // a---e + // e---i + // + // The spans are indexed by their start keys `a` and `e`. If the + // search key is `c` we want to land on the span [a,e). 
If we were + // to use SeekGE then the search key `c` would land on the span + // [e,i) and we'd have to backtrack. The one complexity here is what + // happens for the search key `e`. In that case SeekLT will land us + // on the span [a,e) and we'll have to move forward. + iterSpan := iter.SeekLT(key) + if iterSpan == nil { + iterSpan = iter.Next() + if iterSpan == nil { + // The iterator is empty. + return nil + } + if cmp(key, iterSpan.Start) < 0 { + // The search key lies before the first span. + return nil + } + } + + // Invariant: key > iterSpan.Start + if cmp(key, iterSpan.End) >= 0 { + // The current span lies before the search key. Advance the iterator + // once to potentially land on a key with a start key exactly equal to + // key. (See the comment at the beginning of this function.) + iterSpan = iter.Next() + if iterSpan == nil || cmp(key, iterSpan.Start) < 0 { + // We've run out of spans or we've moved on to a span which + // starts after our search key. + return nil + } + } + return iterSpan +} diff --git a/pebble/internal/keyspan/interleaving_iter.go b/pebble/internal/keyspan/interleaving_iter.go new file mode 100644 index 0000000..e1fd600 --- /dev/null +++ b/pebble/internal/keyspan/interleaving_iter.go @@ -0,0 +1,1149 @@ +// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "context" + "fmt" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" +) + +// A SpanMask may be used to configure an interleaving iterator to skip point +// keys that fall within the bounds of some spans. +type SpanMask interface { + // SpanChanged is invoked by an interleaving iterator whenever the current + // span changes. As the iterator passes into or out of a Span, it invokes + // SpanChanged, passing the new Span. 
When the iterator passes out of a + // span's boundaries and is no longer covered by any span, SpanChanged is + // invoked with a nil span. + // + // SpanChanged is invoked before SkipPoint, and callers may use SpanChanged + // to recalculate state used by SkipPoint for masking. + // + // SpanChanged may be invoked consecutively with identical spans under some + // circumstances, such as repeatedly absolutely positioning an iterator to + // positions covered by the same span, or while changing directions. + SpanChanged(*Span) + // SkipPoint is invoked by the interleaving iterator whenever the iterator + // encounters a point key covered by a Span. If SkipPoint returns true, the + // interleaving iterator skips the point key and all larger keys with the + // same prefix. This is used during range key iteration to skip over point + // keys 'masked' by range keys. + SkipPoint(userKey []byte) bool +} + +// InterleavingIter combines an iterator over point keys with an iterator over +// key spans. +// +// Throughout Pebble, some keys apply at single discrete points within the user +// keyspace. Other keys apply over continuous spans of the user key space. +// Internally, iterators over point keys adhere to the base.InternalIterator +// interface, and iterators over spans adhere to the keyspan.FragmentIterator +// interface. The InterleavingIterator wraps a point iterator and span iterator, +// providing access to all the elements of both iterators. +// +// The InterleavingIterator implements the point base.InternalIterator +// interface. After any of the iterator's methods return a key, a caller may +// call Span to retrieve the span covering the returned key, if any. A span is +// considered to 'cover' a returned key if the span's [start, end) bounds +// include the key's user key. +// +// In addition to tracking the current covering span, InterleavingIter returns a +// special InternalKey at span start boundaries. 
Start boundaries are surfaced +// as a synthetic span marker: an InternalKey with the boundary as the user key, +// the infinite sequence number and a key kind selected from an arbitrary key +// contained within the span (an arbitrary contained key's kind). Since +// which of the Span's key's kind is surfaced is undefined, the caller should +// not use the InternalKey's kind. The caller should only rely on the `Span` +// method for retrieving information about spanning keys. The interleaved +// synthetic keys have the infinite sequence number so that they're interleaved +// before any point keys with the same user key when iterating forward and after +// when iterating backward. +// +// Interleaving the synthetic start key boundaries at the maximum sequence +// number provides an opportunity for the higher-level, public Iterator to +// observe the Span, even if no live point keys exist within the bounds of the +// Span. +// +// When returning a synthetic marker key for a start boundary, InterleavingIter +// will truncate the span's start bound to the SeekGE or SeekPrefixGE search +// key. For example, a SeekGE("d") that finds a span [a, z) may return a +// synthetic span marker key `d#72057594037927935,21`. +// +// If bounds have been applied to the iterator through SetBounds, +// InterleavingIter will truncate the bounds of spans returned through Span to +// the set bounds. The bounds returned through Span are not truncated by a +// SeekGE or SeekPrefixGE search key. Consider, for example SetBounds('c', 'e'), +// with an iterator containing the Span [a,z): +// +// First() = `c#72057594037927935,21` Span() = [c,e) +// SeekGE('d') = `d#72057594037927935,21` Span() = [c,e) +// +// InterleavingIter does not interleave synthetic markers for spans that do not +// contain any keys. +// +// # SpanMask +// +// InterleavingIter takes a SpanMask parameter that may be used to configure the +// behavior of the iterator. See the documentation on the SpanMask type.
+// +// All spans containing keys are exposed during iteration. +type InterleavingIter struct { + cmp base.Compare + comparer *base.Comparer + pointIter base.InternalIterator + keyspanIter FragmentIterator + mask SpanMask + + // lower and upper hold the iteration bounds set through SetBounds. + lower, upper []byte + // keyBuf is used to copy SeekGE or SeekPrefixGE arguments when they're used + // to truncate a span. The byte slices backing a SeekGE/SeekPrefixGE search + // keys can come directly from the end user, so they're copied into keyBuf + // to ensure key stability. + keyBuf []byte + // nextPrefixBuf is used during SeekPrefixGE calls to store the truncated + // upper bound of the returned spans. SeekPrefixGE truncates the returned + // spans to an upper bound of the seeked prefix's immediate successor. + nextPrefixBuf []byte + pointKey *base.InternalKey + pointVal base.LazyValue + // err holds an iterator error from either pointIter or keyspanIter. It's + // reset to nil on seeks. An overview of error-handling mechanics: + // + // Whenever either pointIter or keyspanIter is respositioned and a nil + // key/span is returned, the code performing the positioning is responsible + // for checking the iterator's Error() value. This happens in savePoint and + // saveSpan[Forward,Backward]. + // + // Once i.err is non-nil, the computation of i.pos must set i.pos = + // posExhausted. This happens in compute[Smallest|Largest]Pos and + // [next|prev]Pos. Setting i.pos to posExhausted ensures we'll yield nil to + // the caller, which they'll interpret as a signal they must check Error(). + // + // INVARIANTS: + // i.err != nil => i.pos = posExhausted + err error + // prefix records the iterator's current prefix if the iterator is in prefix + // mode. During prefix mode, Pebble will truncate spans to the next prefix. 
+ // If the iterator subsequently leaves prefix mode, the existing span cached + // in i.span must be invalidated because its bounds do not reflect the + // original span's true bounds. + prefix []byte + // span holds the span at the keyspanIter's current position. If the span is + // wholly contained within the iterator bounds, this span is directly + // returned to the iterator consumer through Span(). If either bound needed + // to be truncated to the iterator bounds, then truncated is set to true and + // Span() must return a pointer to truncatedSpan. + span *Span + // spanMarker holds the synthetic key that is returned when the iterator + // passes over a key span's start bound. + spanMarker base.InternalKey + // truncated indicates whether or not the span at the current position + // needed to be truncated. If it did, truncatedSpan holds the truncated + // span that should be returned. + truncatedSpan Span + truncated bool + + // Keeping all of the bools/uint8s together reduces the sizeof the struct. + + // pos encodes the current position of the iterator: exhausted, on the point + // key, on a keyspan start, or on a keyspan end. + pos interleavePos + // withinSpan indicates whether the iterator is currently positioned within + // the bounds of the current span (i.span). withinSpan must be updated + // whenever the interleaving iterator's position enters or exits the bounds + // of a span. + withinSpan bool + // spanMarkerTruncated is set by SeekGE/SeekPrefixGE calls that truncate a + // span's start bound marker to the search key. It's returned to false on + // the next repositioning of the keyspan iterator. + spanMarkerTruncated bool + // maskSpanChangedCalled records whether or not the last call to + // SpanMask.SpanChanged provided the current span (i.span) or not. + maskSpanChangedCalled bool + // dir indicates the direction of iteration: forward (+1) or backward (-1) + dir int8 +} + +// interleavePos indicates the iterator's current position. 
Note that both +// keyspanStart and keyspanEnd positions correspond to their user key boundaries +// with maximal sequence numbers. This means in the forward direction +// posKeyspanStart and posKeyspanEnd are always interleaved before a posPointKey +// with the same user key. +type interleavePos int8 + +const ( + posUninitialized interleavePos = iota + posExhausted + posPointKey + posKeyspanStart + posKeyspanEnd +) + +// Assert that *InterleavingIter implements the InternalIterator interface. +var _ base.InternalIterator = &InterleavingIter{} + +// InterleavingIterOpts holds options configuring the behavior of a +// InterleavingIter. +type InterleavingIterOpts struct { + Mask SpanMask + LowerBound, UpperBound []byte +} + +// Init initializes the InterleavingIter to interleave point keys from pointIter +// with key spans from keyspanIter. +// +// The point iterator must already have the bounds provided on opts. Init does +// not propagate the bounds down the iterator stack. +func (i *InterleavingIter) Init( + comparer *base.Comparer, + pointIter base.InternalIterator, + keyspanIter FragmentIterator, + opts InterleavingIterOpts, +) { + *i = InterleavingIter{ + cmp: comparer.Compare, + comparer: comparer, + pointIter: pointIter, + keyspanIter: keyspanIter, + mask: opts.Mask, + lower: opts.LowerBound, + upper: opts.UpperBound, + } +} + +// InitSeekGE may be called after Init but before any positioning method. +// InitSeekGE initializes the current position of the point iterator and then +// performs a SeekGE on the keyspan iterator using the provided key. InitSeekGE +// returns whichever point or keyspan key is smaller. After InitSeekGE, the +// iterator is positioned and may be repositioned using relative positioning +// methods. +// +// This method is used specifically for lazily constructing combined iterators. +// It allows for seeding the iterator with the current position of the point +// iterator. 
+func (i *InterleavingIter) InitSeekGE( + prefix, key []byte, pointKey *base.InternalKey, pointValue base.LazyValue, +) (*base.InternalKey, base.LazyValue) { + i.dir = +1 + i.clearMask() + i.prefix = prefix + i.savePoint(pointKey, pointValue) + // NB: This keyspanSeekGE call will truncate the span to the seek key if + // necessary. This truncation is important for cases where a switch to + // combined iteration is made during a user-initiated SeekGE. + i.keyspanSeekGE(key, prefix) + i.computeSmallestPos() + return i.yieldPosition(key, i.nextPos) +} + +// InitSeekLT may be called after Init but before any positioning method. +// InitSeekLT initializes the current position of the point iterator and then +// performs a SeekLT on the keyspan iterator using the provided key. InitSeekLT +// returns whichever point or keyspan key is larger. After InitSeekLT, the +// iterator is positioned and may be repositioned using relative positioning +// methods. +// +// This method is used specifically for lazily constructing combined iterators. +// It allows for seeding the iterator with the current position of the point +// iterator. +func (i *InterleavingIter) InitSeekLT( + key []byte, pointKey *base.InternalKey, pointValue base.LazyValue, +) (*base.InternalKey, base.LazyValue) { + i.dir = -1 + i.clearMask() + i.savePoint(pointKey, pointValue) + i.keyspanSeekLT(key) + i.computeLargestPos() + return i.yieldPosition(i.lower, i.prevPos) +} + +// SeekGE implements (base.InternalIterator).SeekGE. +// +// If there exists a span with a start key ≤ the first matching point key, +// SeekGE will return a synthetic span marker key for the span. If this span's +// start key is less than key, the returned marker will be truncated to key. +// Note that this search-key truncation of the marker's key is not applied to +// the span returned by Span. 
//
// NB: In accordance with the base.InternalIterator contract:
//
//	i.lower ≤ key
func (i *InterleavingIter) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	i.err = nil
	i.clearMask()
	i.disablePrefixMode()
	i.savePoint(i.pointIter.SeekGE(key, flags))

	// We need to seek the keyspan iterator too. If the keyspan iterator was
	// already positioned at a span, we might be able to avoid the seek if the
	// seek key falls within the existing span's bounds.
	if i.span != nil && i.cmp(key, i.span.End) < 0 && i.cmp(key, i.span.Start) >= 0 {
		// We're seeking within the existing span's bounds. We still might need
		// to truncate the span to the iterator's bounds.
		i.saveSpanForward(i.span)
		i.savedKeyspan()
	} else {
		i.keyspanSeekGE(key, nil /* prefix */)
	}

	i.dir = +1
	i.computeSmallestPos()
	return i.yieldPosition(key, i.nextPos)
}

// SeekPrefixGE implements (base.InternalIterator).SeekPrefixGE.
//
// If there exists a span with a start key ≤ the first matching point key,
// SeekPrefixGE will return a synthetic span marker key for the span. If this
// span's start key is less than key, the returned marker will be truncated to
// key. Note that this search-key truncation of the marker's key is not applied
// to the span returned by Span.
//
// NB: In accordance with the base.InternalIterator contract:
//
//	i.lower ≤ key
func (i *InterleavingIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	i.err = nil
	i.clearMask()
	i.prefix = prefix
	i.savePoint(i.pointIter.SeekPrefixGE(prefix, key, flags))

	// We need to seek the keyspan iterator too. If the keyspan iterator was
	// already positioned at a span, we might be able to avoid the seek if the
	// entire seek prefix key falls within the existing span's bounds.
	//
	// During a SeekPrefixGE, Pebble defragments range keys within the bounds of
	// the prefix. For example, a SeekPrefixGE('c', 'c@8') must defragment any
	// overlapping range keys within the bounds of [c,c\00).
	//
	// If range keys are fragmented within a prefix (eg, because a version
	// within a prefix was chosen as an sstable boundary), then it's possible
	// the seek key falls into the current i.span, but the current i.span does
	// not wholly cover the seek prefix.
	//
	// For example, a SeekPrefixGE('d@5') may only defragment a range key to
	// the bounds of [c@2,e). A subsequent SeekPrefixGE('c@0') must re-seek the
	// keyspan iterator, because although 'c@0' is contained within [c@2,e), the
	// full span of the prefix is not.
	//
	// Similarly, a SeekPrefixGE('a@3') may only defragment a range key to the
	// bounds [a,c@8). A subsequent SeekPrefixGE('c@10') must re-seek the
	// keyspan iterator, because although 'c@10' is contained within [a,c@8),
	// the full span of the prefix is not.
	seekKeyspanIter := true
	if i.span != nil && i.cmp(prefix, i.span.Start) >= 0 {
		if ei := i.comparer.Split(i.span.End); i.cmp(prefix, i.span.End[:ei]) < 0 {
			// We're seeking within the existing span's bounds. We still might
			// need to truncate the span to the iterator's bounds.
			i.saveSpanForward(i.span)
			i.savedKeyspan()
			seekKeyspanIter = false
		}
	}
	if seekKeyspanIter {
		i.keyspanSeekGE(key, prefix)
	}

	i.dir = +1
	i.computeSmallestPos()
	return i.yieldPosition(key, i.nextPos)
}

// SeekLT implements (base.InternalIterator).SeekLT.
func (i *InterleavingIter) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*base.InternalKey, base.LazyValue) {
	i.err = nil
	i.clearMask()
	i.disablePrefixMode()
	i.savePoint(i.pointIter.SeekLT(key, flags))

	// We need to seek the keyspan iterator too. If the keyspan iterator was
	// already positioned at a span, we might be able to avoid the seek if the
	// seek key falls within the existing span's bounds.
	if i.span != nil && i.cmp(key, i.span.Start) > 0 && i.cmp(key, i.span.End) < 0 {
		// We're seeking within the existing span's bounds. We still might need
		// to truncate the span to the iterator's bounds.
		i.saveSpanBackward(i.span)
		// The span's start key is still not guaranteed to be less than key,
		// because of the bounds enforcement. Consider the following example:
		//
		// Bounds are set to [d,e). The user performs a SeekLT(d). The
		// FragmentIterator.SeekLT lands on a span [b,f). This span has a start
		// key less than d, as expected. Above, saveSpanBackward truncates the
		// span to match the iterator's current bounds, modifying the span to
		// [d,e), which does not overlap the search space of [-∞, d).
		//
		// This problem is a consequence of the SeekLT's exclusive search key
		// and the fact that we don't perform bounds truncation at every leaf
		// iterator.
		if i.span != nil && i.truncated && i.cmp(i.truncatedSpan.Start, key) >= 0 {
			i.span = nil
		}
		i.savedKeyspan()
	} else {
		i.keyspanSeekLT(key)
	}

	i.dir = -1
	i.computeLargestPos()
	return i.yieldPosition(i.lower, i.prevPos)
}

// First implements (base.InternalIterator).First.
func (i *InterleavingIter) First() (*base.InternalKey, base.LazyValue) {
	i.err = nil
	i.clearMask()
	i.disablePrefixMode()
	i.savePoint(i.pointIter.First())
	i.saveSpanForward(i.keyspanIter.First())
	i.savedKeyspan()
	i.dir = +1
	i.computeSmallestPos()
	return i.yieldPosition(i.lower, i.nextPos)
}

// Last implements (base.InternalIterator).Last.
func (i *InterleavingIter) Last() (*base.InternalKey, base.LazyValue) {
	i.err = nil
	i.clearMask()
	i.disablePrefixMode()
	i.savePoint(i.pointIter.Last())
	i.saveSpanBackward(i.keyspanIter.Last())
	i.savedKeyspan()
	i.dir = -1
	i.computeLargestPos()
	return i.yieldPosition(i.lower, i.prevPos)
}

// Next implements (base.InternalIterator).Next.
func (i *InterleavingIter) Next() (*base.InternalKey, base.LazyValue) {
	if i.dir == -1 {
		// Switching directions.
		i.dir = +1

		if i.mask != nil {
			// Clear the mask while we reposition the point iterator. While
			// switching directions, we may move the point iterator outside of
			// i.span's bounds.
			i.clearMask()
		}

		// When switching directions, iterator state corresponding to the
		// current iterator position (as indicated by i.pos) is already correct.
		// However any state that has yet to be interleaved describes a position
		// behind the current iterator position and needs to be updated to
		// describe the position ahead of the current iterator position.
		switch i.pos {
		case posExhausted:
			// Nothing to do. The below nextPos call will move both the point
			// key and span to their next positions and return
			// MIN(point,s.Start).
		case posPointKey:
			// If we're currently on a point key, the below nextPos will
			// correctly Next the point key iterator to the next point key.
			// Do we need to move the span forwards? If the current span lies
			// entirely behind the current key (!i.withinSpan), then we
			// need to move it to the first span in the forward direction.
			if !i.withinSpan {
				i.saveSpanForward(i.keyspanIter.Next())
				i.savedKeyspan()
			}
		case posKeyspanStart:
			i.withinSpan = true
			// Since we're positioned on a Span, the pointIter is positioned
			// entirely behind the current iterator position. Reposition it
			// ahead of the current iterator position.
			i.savePoint(i.pointIter.Next())
		case posKeyspanEnd:
			// Since we're positioned on a Span, the pointIter is positioned
			// entirely behind the current iterator position. Reposition it
			// ahead of the current iterator position.
			i.savePoint(i.pointIter.Next())
		}
		// Fallthrough to calling i.nextPos.
	}
	i.nextPos()
	return i.yieldPosition(i.lower, i.nextPos)
}

// NextPrefix implements (base.InternalIterator).NextPrefix.
+func (i *InterleavingIter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { + if i.dir == -1 { + panic("pebble: cannot switch directions with NextPrefix") + } + + switch i.pos { + case posExhausted: + return nil, base.LazyValue{} + case posPointKey: + i.savePoint(i.pointIter.NextPrefix(succKey)) + if i.withinSpan { + if i.pointKey == nil || i.cmp(i.span.End, i.pointKey.UserKey) <= 0 { + i.pos = posKeyspanEnd + } else { + i.pos = posPointKey + } + } else { + i.computeSmallestPos() + } + case posKeyspanStart, posKeyspanEnd: + i.nextPos() + } + return i.yieldPosition(i.lower, i.nextPos) +} + +// Prev implements (base.InternalIterator).Prev. +func (i *InterleavingIter) Prev() (*base.InternalKey, base.LazyValue) { + if i.dir == +1 { + // Switching directions. + i.dir = -1 + + if i.mask != nil { + // Clear the mask while we reposition the point iterator. While + // switching directions, we may move the point iterator outside of + // i.span's bounds. + i.clearMask() + } + + // When switching directions, iterator state corresponding to the + // current iterator position (as indicated by i.pos) is already correct. + // However any state that has yet to be interleaved describes a position + // ahead of the current iterator position and needs to be updated to + // describe the position behind the current iterator position. + switch i.pos { + case posExhausted: + // Nothing to do. The below prevPos call will move both the point + // key and span to previous positions and return MAX(point, s.End). + case posPointKey: + // If we're currently on a point key, the point iterator is in the + // right place and the call to prevPos will correctly Prev the point + // key iterator to the previous point key. Do we need to move the + // span backwards? If the current span lies entirely ahead of the + // current key (!i.withinSpan), then we need to move it to the first + // span in the reverse direction. 
+ if !i.withinSpan { + i.saveSpanBackward(i.keyspanIter.Prev()) + i.savedKeyspan() + } + case posKeyspanStart: + // Since we're positioned on a Span, the pointIter is positioned + // entirely ahead of the current iterator position. Reposition it + // behind the current iterator position. + i.savePoint(i.pointIter.Prev()) + // Without considering truncation of spans to seek keys, the keyspan + // iterator is already in the right place. But consider span [a, z) + // and this sequence of iterator calls: + // + // SeekGE('c') = c.RANGEKEYSET#72057594037927935 + // Prev() = a.RANGEKEYSET#72057594037927935 + // + // If the current span's start key was last surfaced truncated due + // to a SeekGE or SeekPrefixGE call, then it's still relevant in the + // reverse direction with an untruncated start key. + if i.spanMarkerTruncated { + // When we fallthrough to calling prevPos, we want to move to + // MAX(point, span.Start). We cheat here by claiming we're + // currently on the end boundary, so that we'll move on to the + // untruncated start key if necessary. + i.pos = posKeyspanEnd + } + case posKeyspanEnd: + // Since we're positioned on a Span, the pointIter is positioned + // entirely ahead of the current iterator position. Reposition it + // behind the current iterator position. + i.savePoint(i.pointIter.Prev()) + } + + if i.spanMarkerTruncated { + // Save the keyspan again to clear truncation. + i.savedKeyspan() + } + // Fallthrough to calling i.prevPos. 
+ } + i.prevPos() + return i.yieldPosition(i.lower, i.prevPos) +} + +// computeSmallestPos sets i.{pos,withinSpan} to: +// +// MIN(i.pointKey, i.span.Start) +func (i *InterleavingIter) computeSmallestPos() { + if i.err == nil { + if i.span != nil && (i.pointKey == nil || i.cmp(i.startKey(), i.pointKey.UserKey) <= 0) { + i.withinSpan = true + i.pos = posKeyspanStart + return + } + i.withinSpan = false + if i.pointKey != nil { + i.pos = posPointKey + return + } + } + i.pos = posExhausted +} + +// computeLargestPos sets i.{pos,withinSpan} to: +// +// MAX(i.pointKey, i.span.End) +func (i *InterleavingIter) computeLargestPos() { + if i.err == nil { + if i.span != nil && (i.pointKey == nil || i.cmp(i.span.End, i.pointKey.UserKey) > 0) { + i.withinSpan = true + i.pos = posKeyspanEnd + return + } + i.withinSpan = false + if i.pointKey != nil { + i.pos = posPointKey + return + } + } + i.pos = posExhausted +} + +// nextPos advances the iterator one position in the forward direction. +func (i *InterleavingIter) nextPos() { + if invariants.Enabled { + defer func() { + if i.err != nil && i.pos != posExhausted { + panic(errors.AssertionFailedf("iterator has accumulated error but i.pos = %d", i.pos)) + } + }() + } + // NB: If i.err != nil or any of the positioning methods performed in this + // function result in i.err != nil, we must set i.pos = posExhausted. We + // perform this check explicitly here, but if any of the branches below + // advance either iterator, they must also check i.err and set posExhausted + // if necessary. 
+ if i.err != nil { + i.pos = posExhausted + return + } + + switch i.pos { + case posExhausted: + i.savePoint(i.pointIter.Next()) + i.saveSpanForward(i.keyspanIter.Next()) + i.savedKeyspan() + i.computeSmallestPos() + case posPointKey: + i.savePoint(i.pointIter.Next()) + if i.err != nil { + i.pos = posExhausted + return + } + // If we're not currently within the span, we want to chose the + // MIN(pointKey,span.Start), which is exactly the calculation performed + // by computeSmallestPos. + if !i.withinSpan { + i.computeSmallestPos() + return + } + // i.withinSpan=true + // Since we previously were within the span, we want to choose the + // MIN(pointKey,span.End). + switch { + case i.span == nil: + panic("i.withinSpan=true and i.span=nil") + case i.pointKey == nil: + // Since i.withinSpan=true, we step onto the end boundary of the + // keyspan. + i.pos = posKeyspanEnd + default: + // i.withinSpan && i.pointKey != nil && i.span != nil + if i.cmp(i.span.End, i.pointKey.UserKey) <= 0 { + i.pos = posKeyspanEnd + } else { + i.pos = posPointKey + } + } + case posKeyspanStart: + // Either a point key or the span's end key comes next. + if i.pointKey != nil && i.cmp(i.pointKey.UserKey, i.span.End) < 0 { + i.pos = posPointKey + } else { + i.pos = posKeyspanEnd + } + case posKeyspanEnd: + i.saveSpanForward(i.keyspanIter.Next()) + i.savedKeyspan() + i.computeSmallestPos() + default: + panic(fmt.Sprintf("unexpected pos=%d", i.pos)) + } +} + +// prevPos advances the iterator one position in the reverse direction. +func (i *InterleavingIter) prevPos() { + if invariants.Enabled { + defer func() { + if i.err != nil && i.pos != posExhausted { + panic(errors.AssertionFailedf("iterator has accumulated error but i.pos = %d", i.pos)) + } + }() + } + // NB: If i.err != nil or any of the positioning methods performed in this + // function result in i.err != nil, we must set i.pos = posExhausted. 
We + // perform this check explicitly here, but if any of the branches below + // advance either iterator, they must also check i.err and set posExhausted + // if necessary. + if i.err != nil { + i.pos = posExhausted + return + } + + switch i.pos { + case posExhausted: + i.savePoint(i.pointIter.Prev()) + i.saveSpanBackward(i.keyspanIter.Prev()) + i.savedKeyspan() + i.computeLargestPos() + case posPointKey: + i.savePoint(i.pointIter.Prev()) + if i.err != nil { + i.pos = posExhausted + return + } + // If we're not currently covered by the span, we want to chose the + // MAX(pointKey,span.End), which is exactly the calculation performed + // by computeLargestPos. + if !i.withinSpan { + i.computeLargestPos() + return + } + switch { + case i.span == nil: + panic("withinSpan=true, but i.span == nil") + case i.pointKey == nil: + i.pos = posKeyspanEnd + default: + // i.withinSpan && i.pointKey != nil && i.span != nil + if i.cmp(i.span.Start, i.pointKey.UserKey) > 0 { + i.pos = posKeyspanStart + } else { + i.pos = posPointKey + } + } + case posKeyspanStart: + i.saveSpanBackward(i.keyspanIter.Prev()) + i.savedKeyspan() + i.computeLargestPos() + case posKeyspanEnd: + // Either a point key or the span's start key is previous. + if i.pointKey != nil && i.cmp(i.pointKey.UserKey, i.span.Start) >= 0 { + i.pos = posPointKey + } else { + i.pos = posKeyspanStart + } + default: + panic(fmt.Sprintf("unexpected pos=%d", i.pos)) + } +} + +func (i *InterleavingIter) yieldPosition( + lowerBound []byte, advance func(), +) (*base.InternalKey, base.LazyValue) { + // This loop returns the first visible position in the current iteration + // direction. Some positions are not visible and skipped. For example, if + // masking is enabled and the iterator is positioned over a masked point + // key, this loop skips the position. If a span's start key should be + // interleaved next, but the span is empty, the loop continues to the next + // key. 
Currently, span end keys are also always skipped, and are used only + // for maintaining internal state. + for { + switch i.pos { + case posExhausted: + return i.yieldNil() + case posPointKey: + if i.pointKey == nil { + panic("i.pointKey is nil") + } + + if i.mask != nil { + i.maybeUpdateMask() + if i.withinSpan && i.mask.SkipPoint(i.pointKey.UserKey) { + // The span covers the point key. If a SkipPoint hook is + // configured, ask it if we should skip this point key. + if i.prefix != nil { + // During prefix-iteration node, once a point is masked, + // all subsequent keys with the same prefix must also be + // masked according to the key ordering. We can stop and + // return nil. + // + // NB: The above is not just an optimization. During + // prefix-iteration mode, the internal iterator contract + // prohibits us from Next-ing beyond the first key + // beyond the iteration prefix. If we didn't already + // stop early, we would need to check if this masked + // point is already beyond the prefix. + return i.yieldNil() + } + // TODO(jackson): If we thread a base.Comparer through to + // InterleavingIter so that we have access to + // ImmediateSuccessor, we could use NextPrefix. We'd need to + // tweak the SpanMask interface slightly. + + // Advance beyond the masked point key. + advance() + continue + } + } + return i.yieldPointKey() + case posKeyspanEnd: + // Don't interleave end keys; just advance. + advance() + continue + case posKeyspanStart: + // Don't interleave an empty span. + if i.span.Empty() { + advance() + continue + } + return i.yieldSyntheticSpanMarker(lowerBound) + default: + panic(fmt.Sprintf("unexpected interleavePos=%d", i.pos)) + } + } +} + +// keyspanSeekGE seeks the keyspan iterator to the first span covering a key ≥ k. +func (i *InterleavingIter) keyspanSeekGE(k []byte, prefix []byte) { + i.saveSpanForward(i.keyspanIter.SeekGE(k)) + i.savedKeyspan() +} + +// keyspanSeekLT seeks the keyspan iterator to the last span covering a key < k. 
func (i *InterleavingIter) keyspanSeekLT(k []byte) {
	i.saveSpanBackward(i.keyspanIter.SeekLT(k))
	// The current span's start key is not guaranteed to be less than key,
	// because of the bounds enforcement. Consider the following example:
	//
	// Bounds are set to [d,e). The user performs a SeekLT(d). The
	// FragmentIterator.SeekLT lands on a span [b,f). This span has a start key
	// less than d, as expected. Above, saveSpanBackward truncates the span to
	// match the iterator's current bounds, modifying the span to [d,e), which
	// does not overlap the search space of [-∞, d).
	//
	// This problem is a consequence of the SeekLT's exclusive search key and
	// the fact that we don't perform bounds truncation at every leaf iterator.
	if i.span != nil && i.truncated && i.cmp(i.truncatedSpan.Start, k) >= 0 {
		i.span = nil
	}
	i.savedKeyspan()
}

// saveSpanForward saves the provided span as the current span during forward
// iteration, truncating it to the iterator's bounds (and, during
// SeekPrefixGE, to the prefix's bounds). A span lying entirely at or above
// the upper bound is discarded. A nil span folds any keyspan-iterator error
// into i.err.
func (i *InterleavingIter) saveSpanForward(span *Span) {
	i.span = span
	i.truncated = false
	i.truncatedSpan = Span{}
	if i.span == nil {
		i.err = firstError(i.err, i.keyspanIter.Error())
		return
	}
	if invariants.Enabled {
		if err := i.keyspanIter.Error(); err != nil {
			panic(errors.WithSecondaryError(
				errors.AssertionFailedf("pebble: %T keyspan iterator returned non-nil span %s while iter has error", i.keyspanIter, i.span),
				err))
		}
	}
	// Check the upper bound if we have one.
	if i.upper != nil && i.cmp(i.span.Start, i.upper) >= 0 {
		i.span = nil
		return
	}

	// TODO(jackson): The key comparisons below truncate bounds whenever the
	// keyspan iterator is repositioned. We could perform this lazily, and do it
	// the first time the user actually asks for this span's bounds in
	// SpanBounds. This would reduce work in the case where there's no span
	// covering the point and the keyspan iterator is non-empty.

	// NB: These truncations don't require setting `spanMarkerTruncated`:
	// That flag only applies to truncated span marker keys.
	if i.lower != nil && i.cmp(i.span.Start, i.lower) < 0 {
		i.truncated = true
		i.truncatedSpan = *i.span
		i.truncatedSpan.Start = i.lower
	}
	if i.upper != nil && i.cmp(i.upper, i.span.End) < 0 {
		if !i.truncated {
			i.truncated = true
			i.truncatedSpan = *i.span
		}
		i.truncatedSpan.End = i.upper
	}
	// If this is a part of a SeekPrefixGE call, we may also need to truncate to
	// the prefix's bounds.
	if i.prefix != nil {
		if !i.truncated {
			i.truncated = true
			i.truncatedSpan = *i.span
		}
		if i.cmp(i.prefix, i.truncatedSpan.Start) > 0 {
			i.truncatedSpan.Start = i.prefix
		}
		i.nextPrefixBuf = i.comparer.ImmediateSuccessor(i.nextPrefixBuf[:0], i.prefix)
		if i.truncated && i.cmp(i.nextPrefixBuf, i.truncatedSpan.End) < 0 {
			i.truncatedSpan.End = i.nextPrefixBuf
		}
	}

	if i.truncated && i.comparer.Equal(i.truncatedSpan.Start, i.truncatedSpan.End) {
		i.span = nil
	}
}

// saveSpanBackward saves the provided span as the current span during reverse
// iteration, truncating it to the iterator's bounds. A span lying entirely at
// or below the lower bound is discarded. A nil span folds any
// keyspan-iterator error into i.err.
func (i *InterleavingIter) saveSpanBackward(span *Span) {
	i.span = span
	i.truncated = false
	i.truncatedSpan = Span{}
	if i.span == nil {
		i.err = firstError(i.err, i.keyspanIter.Error())
		return
	}
	if invariants.Enabled {
		if err := i.keyspanIter.Error(); err != nil {
			panic(errors.WithSecondaryError(
				errors.AssertionFailedf("pebble: %T keyspan iterator returned non-nil span %s while iter has error", i.keyspanIter, i.span),
				err))
		}
	}

	// Check the lower bound if we have one.
	if i.lower != nil && i.cmp(i.span.End, i.lower) <= 0 {
		i.span = nil
		return
	}

	// TODO(jackson): The key comparisons below truncate bounds whenever the
	// keyspan iterator is repositioned. We could perform this lazily, and do it
	// the first time the user actually asks for this span's bounds in
	// SpanBounds. This would reduce work in the case where there's no span
	// covering the point and the keyspan iterator is non-empty.

	// NB: These truncations don't require setting `spanMarkerTruncated`:
	// That flag only applies to truncated span marker keys.
	if i.lower != nil && i.cmp(i.span.Start, i.lower) < 0 {
		i.truncated = true
		i.truncatedSpan = *i.span
		i.truncatedSpan.Start = i.lower
	}
	if i.upper != nil && i.cmp(i.upper, i.span.End) < 0 {
		if !i.truncated {
			i.truncated = true
			i.truncatedSpan = *i.span
		}
		i.truncatedSpan.End = i.upper
	}
	if i.truncated && i.comparer.Equal(i.truncatedSpan.Start, i.truncatedSpan.End) {
		i.span = nil
	}
}

// yieldNil surfaces iterator exhaustion, clearing span-coverage and mask
// state first.
func (i *InterleavingIter) yieldNil() (*base.InternalKey, base.LazyValue) {
	i.withinSpan = false
	i.clearMask()
	return i.verify(nil, base.LazyValue{})
}

// yieldPointKey surfaces the point iterator's current key and value.
func (i *InterleavingIter) yieldPointKey() (*base.InternalKey, base.LazyValue) {
	return i.verify(i.pointKey, i.pointVal)
}

// yieldSyntheticSpanMarker surfaces a synthetic key marking the start of the
// current span, truncated (via a stable copy in i.keyBuf) to lowerBound if
// necessary.
func (i *InterleavingIter) yieldSyntheticSpanMarker(
	lowerBound []byte,
) (*base.InternalKey, base.LazyValue) {
	i.spanMarker.UserKey = i.startKey()
	i.spanMarker.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, i.span.Keys[0].Kind())

	// Truncate the key we return to our lower bound if we have one. Note that
	// we use the lowerBound function parameter, not i.lower. The lowerBound
	// argument is guaranteed to be ≥ i.lower. It may be equal to the SetBounds
	// lower bound, or it could come from a SeekGE or SeekPrefixGE search key.
	if lowerBound != nil && i.cmp(lowerBound, i.startKey()) > 0 {
		// Truncating to the lower bound may violate the upper bound if
		// lowerBound == i.upper. For example, a SeekGE(k) uses k as a lower
		// bound for truncating a span. The span a-z will be truncated to [k,
		// z). If i.upper == k, we'd mistakenly try to return a span [k, k), an
		// invariant violation.
		if i.comparer.Equal(lowerBound, i.upper) {
			return i.yieldNil()
		}

		// If the lowerBound argument came from a SeekGE or SeekPrefixGE
		// call, it may be backed by a user-provided byte slice that is not
		// guaranteed to be stable.
		//
		// If the lowerBound argument is the lower bound set by SetBounds,
		// Pebble owns the slice's memory. However, consider two successive
		// calls to SetBounds(). The second may overwrite the lower bound.
		// Although the external contract requires a seek after a SetBounds,
		// Pebble's tests don't always. For this reason and to simplify
		// reasoning around lifetimes, always copy the bound into keyBuf when
		// truncating.
		i.keyBuf = append(i.keyBuf[:0], lowerBound...)
		i.spanMarker.UserKey = i.keyBuf
		i.spanMarkerTruncated = true
	}
	i.maybeUpdateMask()
	return i.verify(&i.spanMarker, base.LazyValue{})
}

// disablePrefixMode exits prefix-iteration mode, discarding the current span
// because it may have been truncated to the prefix's bounds rather than the
// underlying span's true bounds.
func (i *InterleavingIter) disablePrefixMode() {
	if i.prefix != nil {
		i.prefix = nil
		// Clear the existing span. It may not hold the true end bound of the
		// underlying span.
		i.span = nil
	}
}

// verify checks iterator invariants on the key being returned, passing the
// key and value through unchanged.
func (i *InterleavingIter) verify(
	k *base.InternalKey, v base.LazyValue,
) (*base.InternalKey, base.LazyValue) {
	// Wrap the entire function body in the invariants build tag, so that
	// production builds elide this entire function.
	if invariants.Enabled {
		switch {
		case i.dir == -1 && i.spanMarkerTruncated:
			panic("pebble: invariant violation: truncated span key in reverse iteration")
		case k != nil && i.lower != nil && i.cmp(k.UserKey, i.lower) < 0:
			panic("pebble: invariant violation: key < lower bound")
		case k != nil && i.upper != nil && i.cmp(k.UserKey, i.upper) >= 0:
			panic("pebble: invariant violation: key ≥ upper bound")
		case i.err != nil && k != nil:
			panic("pebble: invariant violation: accumulated error swallowed")
		case i.err == nil && i.pointIter.Error() != nil:
			panic("pebble: invariant violation: pointIter swallowed")
		case i.err == nil && i.keyspanIter.Error() != nil:
			panic("pebble: invariant violation: keyspanIter error swallowed")
		}
	}
	return k, v
}

// savedKeyspan resets per-span bookkeeping after the saved span changes: the
// synthetic marker has not been truncated for this span, and the mask has not
// yet been notified of it.
func (i *InterleavingIter) savedKeyspan() {
	i.spanMarkerTruncated = false
	i.maskSpanChangedCalled = false
}

// maybeUpdateMask updates the current mask, if a mask is configured and the
// mask hasn't been updated with the current keyspan yet.
+func (i *InterleavingIter) maybeUpdateMask() { + switch { + case i.mask == nil, i.maskSpanChangedCalled: + return + case !i.withinSpan || i.span.Empty(): + i.clearMask() + case i.truncated: + i.mask.SpanChanged(&i.truncatedSpan) + i.maskSpanChangedCalled = true + default: + i.mask.SpanChanged(i.span) + i.maskSpanChangedCalled = true + } +} + +// clearMask clears the current mask, if a mask is configured and no mask should +// be active. +func (i *InterleavingIter) clearMask() { + if i.mask != nil { + i.maskSpanChangedCalled = false + i.mask.SpanChanged(nil) + } +} + +func (i *InterleavingIter) startKey() []byte { + if i.truncated { + return i.truncatedSpan.Start + } + return i.span.Start +} + +func (i *InterleavingIter) savePoint(key *base.InternalKey, value base.LazyValue) { + i.pointKey, i.pointVal = key, value + if key == nil { + i.err = firstError(i.err, i.pointIter.Error()) + } + if invariants.Enabled { + if err := i.pointIter.Error(); key != nil && err != nil { + panic(errors.WithSecondaryError( + errors.AssertionFailedf("pebble: %T point iterator returned non-nil key %q while iter has error", i.pointIter, key), + err)) + } + } +} + +// Span returns the span covering the last key returned, if any. A span key is +// considered to 'cover' a key if the key falls within the span's user key +// bounds. The returned span is owned by the InterleavingIter. The caller is +// responsible for copying if stability is required. +// +// Span will never return an invalid or empty span. +func (i *InterleavingIter) Span() *Span { + if !i.withinSpan || len(i.span.Keys) == 0 { + return nil + } else if i.truncated { + return &i.truncatedSpan + } + return i.span +} + +// SetBounds implements (base.InternalIterator).SetBounds. +func (i *InterleavingIter) SetBounds(lower, upper []byte) { + i.lower, i.upper = lower, upper + i.pointIter.SetBounds(lower, upper) + i.Invalidate() +} + +// SetContext implements (base.InternalIterator).SetContext. 
func (i *InterleavingIter) SetContext(ctx context.Context) {
	i.pointIter.SetContext(ctx)
}

// Invalidate invalidates the interleaving iterator's current position, clearing
// its state. This prevents optimizations such as reusing the current span on
// seek.
func (i *InterleavingIter) Invalidate() {
	i.span = nil
	i.pointKey = nil
	i.pointVal = base.LazyValue{}
}

// Error implements (base.InternalIterator).Error.
func (i *InterleavingIter) Error() error {
	return i.err
}

// Close implements (base.InternalIterator).Close. Both child iterators are
// always closed; the first error encountered is returned.
func (i *InterleavingIter) Close() error {
	perr := i.pointIter.Close()
	rerr := i.keyspanIter.Close()
	return firstError(perr, rerr)
}

// String implements (base.InternalIterator).String.
func (i *InterleavingIter) String() string {
	return fmt.Sprintf("keyspan-interleaving(%q)", i.pointIter.String())
}

// firstError returns err0 if it is non-nil; otherwise it returns err1 (which
// may itself be nil).
func firstError(err0, err1 error) error {
	if err0 != nil {
		return err0
	}
	return err1
}
diff --git a/pebble/internal/keyspan/interleaving_iter_test.go b/pebble/internal/keyspan/interleaving_iter_test.go
new file mode 100644
index 0000000..116f037
--- /dev/null
+++ b/pebble/internal/keyspan/interleaving_iter_test.go
@@ -0,0 +1,291 @@
// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package keyspan

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"sort"
	"strings"
	"testing"

	"github.com/cockroachdb/datadriven"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/testkeys"
	"github.com/stretchr/testify/require"
)

func TestInterleavingIter(t *testing.T) {
	runInterleavingIterTest(t, "testdata/interleaving_iter")
}

func TestInterleavingIter_Masking(t *testing.T) {
	runInterleavingIterTest(t, "testdata/interleaving_iter_masking")
}

// maskingHooks implements the iterator's masking hooks for tests: it records
// SpanChanged calls to log and masks point keys whose suffix is older than
// the smallest eligible suffix within the current span.
type maskingHooks struct {
	log        io.Writer
	cmp        base.Compare
	split      base.Split
	threshold  []byte
	maskSuffix []byte
}

func (m *maskingHooks) SpanChanged(s *Span) {
	if m.log != nil {
		if s == nil {
			fmt.Fprintln(m.log, "-- SpanChanged(nil)")
		} else {
			fmt.Fprintf(m.log, "-- SpanChanged(%s)\n", s)
		}
	}

	// Find the smallest suffix of a key contained within the Span, excluding
	// suffixes less than m.threshold.
	m.maskSuffix = nil
	if s == nil || m.threshold == nil || len(s.Keys) == 0 {
		return
	}
	for i := range s.Keys {
		if s.Keys[i].Suffix == nil {
			continue
		}
		if m.cmp(s.Keys[i].Suffix, m.threshold) < 0 {
			continue
		}
		if m.maskSuffix == nil || m.cmp(m.maskSuffix, s.Keys[i].Suffix) > 0 {
			m.maskSuffix = s.Keys[i].Suffix
		}
	}
}

func (m *maskingHooks) SkipPoint(userKey []byte) bool {
	pointSuffix := userKey[m.split(userKey):]
	return m.maskSuffix != nil && len(pointSuffix) > 0 && m.cmp(m.maskSuffix, pointSuffix) < 0
}

// runInterleavingIterTest drives an InterleavingIter over datadriven test
// input from filename, printing the interleaved point key and covering span
// after each iterator command.
func runInterleavingIterTest(t *testing.T, filename string) {
	cmp := testkeys.Comparer.Compare
	var keyspanIter MergingIter
	var pointIter pointIterator
	var iter InterleavingIter
	var buf bytes.Buffer
	hooks := maskingHooks{
		log:   &buf,
		cmp:   testkeys.Comparer.Compare,
		split: testkeys.Comparer.Split,
	}

	var prevKey *base.InternalKey
	formatKey := func(k *base.InternalKey, _ base.LazyValue) {
		if k == nil {
			fmt.Fprint(&buf, ".")
			return
		}
		prevKey = k
		s := iter.Span()
		fmt.Fprintf(&buf, "PointKey: %s\n", k.String())
		if s != nil {
			fmt.Fprintf(&buf, "Span: %s\n-", s)
		} else {
			fmt.Fprintf(&buf, "Span: %s\n-", Span{})
		}
	}

	datadriven.RunTest(t, filename, func(t *testing.T, td *datadriven.TestData) string {
		buf.Reset()
		switch td.Cmd {
		case "set-masking-threshold":
			hooks.threshold = []byte(strings.TrimSpace(td.Input))
			return "OK"
		case "define-rangekeys":
			var spans []Span
			lines := strings.Split(strings.TrimSpace(td.Input), "\n")
			for _, line := range lines {
				spans = append(spans, ParseSpan(line))
			}
			keyspanIter.Init(cmp, noopTransform, new(MergingBuffers), NewIter(cmp, spans))
			hooks.maskSuffix = nil
			iter.Init(testkeys.Comparer, &pointIter, &keyspanIter,
				InterleavingIterOpts{Mask: &hooks})
			return "OK"
		case "define-pointkeys":
			var points []base.InternalKey
			lines := strings.Split(strings.TrimSpace(td.Input), "\n")
			for _, line := range lines {
				points = append(points, base.ParseInternalKey(line))
			}
			pointIter = pointIterator{cmp: cmp, keys: points}
			hooks.maskSuffix = nil
			iter.Init(testkeys.Comparer, &pointIter, &keyspanIter,
				InterleavingIterOpts{Mask: &hooks})
			return "OK"
		case "iter":
			buf.Reset()
			// Clear any previous bounds.
			iter.SetBounds(nil, nil)
			prevKey = nil
			lines := strings.Split(strings.TrimSpace(td.Input), "\n")
			for _, line := range lines {
				bufLen := buf.Len()
				line = strings.TrimSpace(line)
				i := strings.IndexByte(line, ' ')
				iterCmd := line
				if i > 0 {
					iterCmd = string(line[:i])
				}
				switch iterCmd {
				case "first":
					formatKey(iter.First())
				case "last":
					formatKey(iter.Last())
				case "next":
					formatKey(iter.Next())
				case "next-prefix":
					// NB: next-prefix requires a preceding command to have
					// positioned the iterator on a valid key (prevKey).
					succKey := testkeys.Comparer.ImmediateSuccessor(nil, prevKey.UserKey[:testkeys.Comparer.Split(prevKey.UserKey)])
					formatKey(iter.NextPrefix(succKey))
				case "prev":
					formatKey(iter.Prev())
				case "seek-ge":
					formatKey(iter.SeekGE([]byte(strings.TrimSpace(line[i:])), base.SeekGEFlagsNone))
				case "seek-prefix-ge":
					key := []byte(strings.TrimSpace(line[i:]))
					prefix := key[:testkeys.Comparer.Split(key)]
					formatKey(iter.SeekPrefixGE(prefix, key, base.SeekGEFlagsNone))
				case "seek-lt":
					formatKey(iter.SeekLT([]byte(strings.TrimSpace(line[i:])), base.SeekLTFlagsNone))
				case "set-bounds":
					bounds := strings.Fields(line[i:])
					if len(bounds) != 2 {
						return fmt.Sprintf("set-bounds expects 2 bounds, got %d", len(bounds))
					}
					l, u := []byte(bounds[0]), []byte(bounds[1])
					if bounds[0] == "." {
						l = nil
					}
					if bounds[1] == "." {
						u = nil
					}
					iter.SetBounds(l, u)
				default:
					return fmt.Sprintf("unrecognized iter command %q", iterCmd)
				}
				require.NoError(t, iter.Error())
				if buf.Len() > bufLen {
					fmt.Fprintln(&buf)
				}
			}
			return strings.TrimSpace(buf.String())
		default:
			return fmt.Sprintf("unrecognized command %q", td.Cmd)
		}
	})
	require.NoError(t, iter.Close())
}

// pointIterator is a simple in-memory base.InternalIterator over a slice of
// keys, used as the point-key input in tests. It uses sort.Search, so keys
// are expected to be ordered by cmp — callers supply them pre-sorted.
type pointIterator struct {
	cmp   base.Compare
	keys  []base.InternalKey
	lower []byte
	upper []byte
	index int
}

var _ base.InternalIterator = &pointIterator{}

func (i *pointIterator) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	i.index = sort.Search(len(i.keys), func(j int) bool {
		return i.cmp(i.keys[j].UserKey, key) >= 0
	})
	if i.index < 0 || i.index >= len(i.keys) {
		return nil, base.LazyValue{}
	}
	if i.upper != nil && i.cmp(i.keys[i.index].UserKey, i.upper) >= 0 {
		return nil, base.LazyValue{}
	}
	return &i.keys[i.index], base.LazyValue{}
}

func (i *pointIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	return i.SeekGE(key, flags)
}

func (i *pointIterator) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*base.InternalKey, base.LazyValue) {
	i.index = sort.Search(len(i.keys), func(j int) bool {
		return i.cmp(i.keys[j].UserKey, key) >= 0
	})
	i.index--
	if i.index < 0 || i.index >= len(i.keys) {
		return nil, base.LazyValue{}
	}
	if i.lower != nil && i.cmp(i.keys[i.index].UserKey, i.lower) < 0 {
		return nil, base.LazyValue{}
	}
	return &i.keys[i.index], base.LazyValue{}
}

func (i *pointIterator) First() (*base.InternalKey, base.LazyValue) {
	i.index = 0
	if i.index < 0 || i.index >= len(i.keys) {
		return nil, base.LazyValue{}
	}
	if i.upper != nil && i.cmp(i.keys[i.index].UserKey, i.upper) >= 0 {
		return nil, base.LazyValue{}
	}
	return &i.keys[i.index], base.LazyValue{}
}

func (i *pointIterator) Last() (*base.InternalKey, base.LazyValue) {
	i.index = len(i.keys) - 1
	if i.index < 0 || i.index >= len(i.keys) {
		return nil, base.LazyValue{}
	}
	if i.lower != nil && i.cmp(i.keys[i.index].UserKey, i.lower) < 0 {
		return nil, base.LazyValue{}
	}
	return &i.keys[i.index], base.LazyValue{}
}

func (i *pointIterator) Next() (*base.InternalKey, base.LazyValue) {
	i.index++
	if i.index < 0 || i.index >= len(i.keys) {
		return nil, base.LazyValue{}
	}
	if i.upper != nil && i.cmp(i.keys[i.index].UserKey, i.upper) >= 0 {
		return nil, base.LazyValue{}
	}
	return &i.keys[i.index], base.LazyValue{}
}

func (i *pointIterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) {
	return i.SeekGE(succKey, base.SeekGEFlagsNone)
}

func (i *pointIterator) Prev() (*base.InternalKey, base.LazyValue) {
	i.index--
	if i.index < 0 || i.index >= len(i.keys) {
		return nil, base.LazyValue{}
	}
	if i.lower != nil && i.cmp(i.keys[i.index].UserKey, i.lower) < 0 {
		return nil, base.LazyValue{}
	}
	return &i.keys[i.index], base.LazyValue{}
}

func (i *pointIterator) Close() error   { return nil }
func (i *pointIterator) Error() error   { return nil }
func (i *pointIterator) String() string { return "test-point-iterator" }
func (i *pointIterator) SetBounds(lower, upper []byte) {
	i.lower, i.upper = lower, upper
}
func (i *pointIterator) SetContext(_ context.Context) {}
diff --git a/pebble/internal/keyspan/internal_iter_shim.go b/pebble/internal/keyspan/internal_iter_shim.go
new file mode 100644
index 0000000..bb9e37b
--- /dev/null
+++ b/pebble/internal/keyspan/internal_iter_shim.go
@@ -0,0 +1,125 @@
// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
+ +package keyspan + +import ( + "context" + + "github.com/cockroachdb/pebble/internal/base" +) + +// InternalIteratorShim is a temporary iterator type used as a shim between +// keyspan.MergingIter and base.InternalIterator. It's used temporarily for +// range deletions during compactions, allowing range deletions to be +// interleaved by a compaction input iterator. +// +// TODO(jackson): This type should be removed, and the usages converted to using +// an InterleavingIterator type that interleaves keyspan.Spans from a +// keyspan.FragmentIterator with point keys. +type InternalIteratorShim struct { + miter MergingIter + mbufs MergingBuffers + span *Span + iterKey base.InternalKey +} + +// Assert that InternalIteratorShim implements InternalIterator. +var _ base.InternalIterator = &InternalIteratorShim{} + +// Init initializes the internal iterator shim to merge the provided fragment +// iterators. +func (i *InternalIteratorShim) Init(cmp base.Compare, iters ...FragmentIterator) { + i.miter.Init(cmp, noopTransform, &i.mbufs, iters...) +} + +// Span returns the span containing the full set of keys over the key span at +// the current iterator position. +func (i *InternalIteratorShim) Span() *Span { + return i.span +} + +// SeekGE implements (base.InternalIterator).SeekGE. +func (i *InternalIteratorShim) SeekGE( + key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +// SeekPrefixGE implements (base.InternalIterator).SeekPrefixGE. +func (i *InternalIteratorShim) SeekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +// SeekLT implements (base.InternalIterator).SeekLT. +func (i *InternalIteratorShim) SeekLT( + key []byte, flags base.SeekLTFlags, +) (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +// First implements (base.InternalIterator).First. 
+func (i *InternalIteratorShim) First() (*base.InternalKey, base.LazyValue) { + i.span = i.miter.First() + for i.span != nil && i.span.Empty() { + i.span = i.miter.Next() + } + if i.span == nil { + return nil, base.LazyValue{} + } + i.iterKey = base.InternalKey{UserKey: i.span.Start, Trailer: i.span.Keys[0].Trailer} + return &i.iterKey, base.MakeInPlaceValue(i.span.End) +} + +// Last implements (base.InternalIterator).Last. +func (i *InternalIteratorShim) Last() (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +// Next implements (base.InternalIterator).Next. +func (i *InternalIteratorShim) Next() (*base.InternalKey, base.LazyValue) { + i.span = i.miter.Next() + for i.span != nil && i.span.Empty() { + i.span = i.miter.Next() + } + if i.span == nil { + return nil, base.LazyValue{} + } + i.iterKey = base.InternalKey{UserKey: i.span.Start, Trailer: i.span.Keys[0].Trailer} + return &i.iterKey, base.MakeInPlaceValue(i.span.End) +} + +// NextPrefix implements (base.InternalIterator).NextPrefix. +func (i *InternalIteratorShim) NextPrefix([]byte) (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +// Prev implements (base.InternalIterator).Prev. +func (i *InternalIteratorShim) Prev() (*base.InternalKey, base.LazyValue) { + panic("unimplemented") +} + +// Error implements (base.InternalIterator).Error. +func (i *InternalIteratorShim) Error() error { + return i.miter.Error() +} + +// Close implements (base.InternalIterator).Close. +func (i *InternalIteratorShim) Close() error { + return i.miter.Close() +} + +// SetBounds implements (base.InternalIterator).SetBounds. +func (i *InternalIteratorShim) SetBounds(lower, upper []byte) { +} + +// SetContext implements (base.InternalIterator).SetContext. +func (i *InternalIteratorShim) SetContext(_ context.Context) {} + +// String implements fmt.Stringer. 
+func (i *InternalIteratorShim) String() string { + return i.miter.String() +} diff --git a/pebble/internal/keyspan/iter.go b/pebble/internal/keyspan/iter.go new file mode 100644 index 0000000..7f8ceb8 --- /dev/null +++ b/pebble/internal/keyspan/iter.go @@ -0,0 +1,220 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/manifest" +) + +// FragmentIterator defines an iterator interface over spans. The spans +// surfaced by a FragmentIterator must be non-overlapping. This is achieved by +// fragmenting spans at overlap points (see Fragmenter). +// +// A Span returned by a FragmentIterator is only valid until the next +// positioning method. Some implementations (eg, keyspan.Iter) may provide +// longer lifetimes but implementations need only guarantee stability until the +// next positioning method. +type FragmentIterator interface { + // SeekGE moves the iterator to the first span covering a key greater than + // or equal to the given key. This is equivalent to seeking to the first + // span with an end key greater than the given key. + SeekGE(key []byte) *Span + + // SeekLT moves the iterator to the last span covering a key less than the + // given key. This is equivalent to seeking to the last span with a start + // key less than the given key. + SeekLT(key []byte) *Span + + // First moves the iterator to the first span. + First() *Span + + // Last moves the iterator to the last span. + Last() *Span + + // Next moves the iterator to the next span. + // + // It is valid to call Next when the iterator is positioned before the first + // key/value pair due to either a prior call to SeekLT or Prev which + // returned an invalid span. 
It is not allowed to call Next when the + // previous call to SeekGE, SeekPrefixGE or Next returned an invalid span. + Next() *Span + + // Prev moves the iterator to the previous span. + // + // It is valid to call Prev when the iterator is positioned after the last + // key/value pair due to either a prior call to SeekGE or Next which + // returned an invalid span. It is not allowed to call Prev when the + // previous call to SeekLT or Prev returned an invalid span. + Prev() *Span + + // Error returns any accumulated error. + // + // TODO(jackson): Lift errors into return values on the positioning methods. + Error() error + + // Close closes the iterator and returns any accumulated error. Exhausting + // the iterator is not considered to be an error. It is valid to call Close + // multiple times. Other methods should not be called after the iterator has + // been closed. + Close() error +} + +// TableNewSpanIter creates a new iterator for range key spans for the given +// file. +type TableNewSpanIter func(file *manifest.FileMetadata, iterOptions SpanIterOptions) (FragmentIterator, error) + +// SpanIterOptions is a subset of IterOptions that are necessary to instantiate +// per-sstable span iterators. +type SpanIterOptions struct { + // RangeKeyFilters can be used to avoid scanning tables and blocks in tables + // when iterating over range keys. + RangeKeyFilters []base.BlockPropertyFilter +} + +// Iter is an iterator over a set of fragmented spans. +type Iter struct { + cmp base.Compare + spans []Span + index int +} + +// Iter implements the FragmentIterator interface. +var _ FragmentIterator = (*Iter)(nil) + +// NewIter returns a new iterator over a set of fragmented spans. +func NewIter(cmp base.Compare, spans []Span) *Iter { + i := &Iter{} + i.Init(cmp, spans) + return i +} + +// Count returns the number of spans contained by Iter. +func (i *Iter) Count() int { + return len(i.spans) +} + +// Init initializes an Iter with the provided spans. 
+func (i *Iter) Init(cmp base.Compare, spans []Span) { + *i = Iter{ + cmp: cmp, + spans: spans, + index: -1, + } +} + +// SeekGE implements FragmentIterator.SeekGE. +func (i *Iter) SeekGE(key []byte) *Span { + // NB: manually inlined sort.Search is ~5% faster. + // + // Define f(j) = false iff the span i.spans[j] is strictly before `key` + // (equivalently, i.spans[j].End ≤ key.) + // + // Define f(-1) == false and f(n) == true. + // Invariant: f(index-1) == false, f(upper) == true. + i.index = 0 + upper := len(i.spans) + for i.index < upper { + h := int(uint(i.index+upper) >> 1) // avoid overflow when computing h + // i.index ≤ h < upper + if i.cmp(key, i.spans[h].End) >= 0 { + i.index = h + 1 // preserves f(i-1) == false + } else { + upper = h // preserves f(j) == true + } + } + + // i.index == upper, f(i.index-1) == false, and f(upper) (= f(i.index)) == + // true => answer is i.index. + if i.index >= len(i.spans) { + return nil + } + return &i.spans[i.index] +} + +// SeekLT implements FragmentIterator.SeekLT. +func (i *Iter) SeekLT(key []byte) *Span { + // NB: manually inlined sort.Search is ~5% faster. + // + // Define f(-1) == false and f(n) == true. + // Invariant: f(index-1) == false, f(upper) == true. + i.index = 0 + upper := len(i.spans) + for i.index < upper { + h := int(uint(i.index+upper) >> 1) // avoid overflow when computing h + // i.index ≤ h < upper + if i.cmp(key, i.spans[h].Start) > 0 { + i.index = h + 1 // preserves f(i-1) == false + } else { + upper = h // preserves f(j) == true + } + } + // i.index == upper, f(i.index-1) == false, and f(upper) (= f(i.index)) == + // true => answer is i.index. + + // Since keys are strictly increasing, if i.index > 0 then i.index-1 will be + // the largest whose key is < the key sought. + i.index-- + if i.index < 0 { + return nil + } + return &i.spans[i.index] +} + +// First implements FragmentIterator.First. 
+func (i *Iter) First() *Span { + if len(i.spans) == 0 { + return nil + } + i.index = 0 + return &i.spans[i.index] +} + +// Last implements FragmentIterator.Last. +func (i *Iter) Last() *Span { + if len(i.spans) == 0 { + return nil + } + i.index = len(i.spans) - 1 + return &i.spans[i.index] +} + +// Next implements FragmentIterator.Next. +func (i *Iter) Next() *Span { + if i.index >= len(i.spans) { + return nil + } + i.index++ + if i.index >= len(i.spans) { + return nil + } + return &i.spans[i.index] +} + +// Prev implements FragmentIterator.Prev. +func (i *Iter) Prev() *Span { + if i.index < 0 { + return nil + } + i.index-- + if i.index < 0 { + return nil + } + return &i.spans[i.index] +} + +// Error implements FragmentIterator.Error. +func (i *Iter) Error() error { + return nil +} + +// Close implements FragmentIterator.Close. +func (i *Iter) Close() error { + return nil +} + +func (i *Iter) String() string { + return "fragmented-spans" +} diff --git a/pebble/internal/keyspan/iter_test.go b/pebble/internal/keyspan/iter_test.go new file mode 100644 index 0000000..c269f3b --- /dev/null +++ b/pebble/internal/keyspan/iter_test.go @@ -0,0 +1,147 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "bytes" + "fmt" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" +) + +func runFragmentIteratorCmd(iter FragmentIterator, input string, extraInfo func() string) string { + var b bytes.Buffer + for _, line := range strings.Split(input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + var span *Span + switch parts[0] { + case "seek-ge": + if len(parts) != 2 { + return "seek-ge \n" + } + span = iter.SeekGE([]byte(strings.TrimSpace(parts[1]))) + case "seek-lt": + if len(parts) != 2 { + return "seek-lt \n" + } + span = iter.SeekLT([]byte(strings.TrimSpace(parts[1]))) + case "first": + span = iter.First() + case "last": + span = iter.Last() + case "next": + span = iter.Next() + case "prev": + span = iter.Prev() + default: + return fmt.Sprintf("unknown op: %s", parts[0]) + } + if span != nil { + fmt.Fprintf(&b, "%s", span) + if extraInfo != nil { + fmt.Fprintf(&b, " (%s)", extraInfo()) + } + b.WriteByte('\n') + } else if err := iter.Error(); err != nil { + fmt.Fprintf(&b, "err=%v\n", err) + } else { + fmt.Fprintf(&b, ".\n") + } + } + return b.String() +} + +func TestIter(t *testing.T) { + var spans []Span + datadriven.RunTest(t, "testdata/iter", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + spans = nil + for _, line := range strings.Split(d.Input, "\n") { + spans = append(spans, ParseSpan(line)) + } + return "" + + case "iter": + iter := NewIter(base.DefaultComparer.Compare, spans) + defer iter.Close() + return runFragmentIteratorCmd(iter, d.Input, nil) + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +// invalidatingIter wraps a FragmentIterator and implements FragmentIterator +// itself. Spans surfaced by the inner iterator are copied to buffers that are +// zeroed by sbubsequent iterator positioning calls. 
This is intended to help +// surface bugs in improper lifetime expectations of Spans. +type invalidatingIter struct { + iter FragmentIterator + bufs [][]byte + keys []Key + span Span +} + +// invalidatingIter implements FragmentIterator. +var _ FragmentIterator = (*invalidatingIter)(nil) + +func (i *invalidatingIter) invalidate(s *Span) *Span { + // Zero the entirety of the byte bufs and the keys slice. + for j := range i.bufs { + for k := range i.bufs[j] { + i.bufs[j][k] = 0x00 + } + i.bufs[j] = nil + } + for j := range i.keys { + i.keys[j] = Key{} + } + if s == nil { + return nil + } + + // Copy all of the span's slices into slices owned by the invalidating iter + // that we can invalidate on a subsequent positioning method. + i.bufs = i.bufs[:0] + i.keys = i.keys[:0] + i.span = Span{ + Start: i.saveBytes(s.Start), + End: i.saveBytes(s.End), + } + for j := range s.Keys { + i.keys = append(i.keys, Key{ + Trailer: s.Keys[j].Trailer, + Suffix: i.saveBytes(s.Keys[j].Suffix), + Value: i.saveBytes(s.Keys[j].Value), + }) + } + i.span.Keys = i.keys + return &i.span +} + +func (i *invalidatingIter) saveBytes(b []byte) []byte { + if b == nil { + return nil + } + saved := append([]byte(nil), b...) 
+ i.bufs = append(i.bufs, saved) + return saved +} + +func (i *invalidatingIter) SeekGE(key []byte) *Span { return i.invalidate(i.iter.SeekGE(key)) } +func (i *invalidatingIter) SeekLT(key []byte) *Span { return i.invalidate(i.iter.SeekLT(key)) } +func (i *invalidatingIter) First() *Span { return i.invalidate(i.iter.First()) } +func (i *invalidatingIter) Last() *Span { return i.invalidate(i.iter.Last()) } +func (i *invalidatingIter) Next() *Span { return i.invalidate(i.iter.Next()) } +func (i *invalidatingIter) Prev() *Span { return i.invalidate(i.iter.Prev()) } +func (i *invalidatingIter) Close() error { return i.iter.Close() } +func (i *invalidatingIter) Error() error { return i.iter.Error() } diff --git a/pebble/internal/keyspan/level_iter.go b/pebble/internal/keyspan/level_iter.go new file mode 100644 index 0000000..6dd7ac6 --- /dev/null +++ b/pebble/internal/keyspan/level_iter.go @@ -0,0 +1,521 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "fmt" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/manifest" +) + +// LevelIter provides a merged view of spans from sstables in a level. +// It takes advantage of level invariants to only have one sstable span block +// open at one time, opened using the newIter function passed in. +type LevelIter struct { + cmp base.Compare + // Denotes the kind of key the level iterator should read. If the key type + // is KeyTypePoint, the level iterator will read range tombstones (which + // only affect point keys). If the key type is KeyTypeRange, the level + // iterator will read range keys. It is invalid to configure an iterator + // with the KeyTypePointAndRange key type. 
+ // + // If key type is KeyTypePoint, no straddle spans are emitted between files, + // and point key bounds are used to find files instead of range key bounds. + // + // TODO(bilal): Straddle spans can safely be produced in rangedel mode once + // we can guarantee that we will never read sstables in a level that split + // user keys across them. This might be guaranteed in a future release, but + // as of CockroachDB 22.2 it is not guaranteed, so to be safe disable it when + // keyType == KeyTypePoint + keyType manifest.KeyType + // The LSM level this LevelIter is initialized for. Used in logging. + level manifest.Level + // The below fields are used to fill in gaps between adjacent files' range + // key spaces. This is an optimization to avoid unnecessarily loading files + // in cases where range keys are sparse and rare. dir is set by every + // positioning operation, straddleDir is set to dir whenever a straddling + // Span is synthesized and the last positioning operation returned a + // synthesized straddle span. + // + // Note that when a straddle span is initialized, iterFile is modified to + // point to the next file in the straddleDir direction. A change of direction + // on a straddle key therefore necessitates the value of iterFile to be + // reverted. + dir int + straddle Span + straddleDir int + // The iter for the current file (iterFile). It is nil under any of the + // following conditions: + // - files.Current() == nil + // - err != nil + // - straddleDir != 0, in which case iterFile is not nil and points to the + // next file (in the straddleDir direction). + // - some other constraint, like the bounds in opts, caused the file at index to not + // be relevant to the iteration. + iter FragmentIterator + // iterFile holds the current file. + // INVARIANT: iterFile = files.Current() + iterFile *manifest.FileMetadata + newIter TableNewSpanIter + files manifest.LevelIterator + err error + + // The options that were passed in. 
+ tableOpts SpanIterOptions + + // TODO(bilal): Add InternalIteratorStats. +} + +// LevelIter implements the keyspan.FragmentIterator interface. +var _ FragmentIterator = (*LevelIter)(nil) + +// NewLevelIter returns a LevelIter. +func NewLevelIter( + opts SpanIterOptions, + cmp base.Compare, + newIter TableNewSpanIter, + files manifest.LevelIterator, + level manifest.Level, + keyType manifest.KeyType, +) *LevelIter { + l := &LevelIter{} + l.Init(opts, cmp, newIter, files, level, keyType) + return l +} + +// Init initializes a LevelIter. +func (l *LevelIter) Init( + opts SpanIterOptions, + cmp base.Compare, + newIter TableNewSpanIter, + files manifest.LevelIterator, + level manifest.Level, + keyType manifest.KeyType, +) { + l.err = nil + l.level = level + l.tableOpts = opts + l.cmp = cmp + l.iterFile = nil + l.newIter = newIter + switch keyType { + case manifest.KeyTypePoint: + l.keyType = keyType + l.files = files.Filter(keyType) + case manifest.KeyTypeRange: + l.keyType = keyType + l.files = files.Filter(keyType) + default: + panic(fmt.Sprintf("unsupported key type: %v", keyType)) + } +} + +func (l *LevelIter) findFileGE(key []byte) *manifest.FileMetadata { + // Find the earliest file whose largest key is >= key. + // + // If the earliest file has its largest key == key and that largest key is a + // range deletion sentinel, we know that we manufactured this sentinel to convert + // the exclusive range deletion end key into an inclusive key (reminder: [start, end)#seqnum + // is the form of a range deletion sentinel which can contribute a largest key = end#sentinel). + // In this case we don't return this as the earliest file since there is nothing actually + // equal to key in it. 
+ + m := l.files.SeekGE(l.cmp, key) + for m != nil { + largestKey := m.LargestRangeKey + if l.keyType == manifest.KeyTypePoint { + largestKey = m.LargestPointKey + } + if !largestKey.IsExclusiveSentinel() || l.cmp(largestKey.UserKey, key) != 0 { + break + } + m = l.files.Next() + } + return m +} + +func (l *LevelIter) findFileLT(key []byte) *manifest.FileMetadata { + // Find the last file whose smallest key is < key. + return l.files.SeekLT(l.cmp, key) +} + +type loadFileReturnIndicator int8 + +const ( + noFileLoaded loadFileReturnIndicator = iota + fileAlreadyLoaded + newFileLoaded +) + +func (l *LevelIter) loadFile(file *manifest.FileMetadata, dir int) loadFileReturnIndicator { + indicator := noFileLoaded + if l.iterFile == file { + if l.err != nil { + return noFileLoaded + } + if l.iter != nil { + // We are already at the file, but we would need to check for bounds. + // Set indicator accordingly. + indicator = fileAlreadyLoaded + } + // We were already at file, but don't have an iterator, probably because the file was + // beyond the iteration bounds. It may still be, but it is also possible that the bounds + // have changed. We handle that below. + } + + // Note that LevelIter.Close() can be called multiple times. + if indicator != fileAlreadyLoaded { + if err := l.Close(); err != nil { + return noFileLoaded + } + } + + l.iterFile = file + if file == nil { + return noFileLoaded + } + if indicator != fileAlreadyLoaded { + l.iter, l.err = l.newIter(file, l.tableOpts) + indicator = newFileLoaded + } + if l.err != nil { + return noFileLoaded + } + return indicator +} + +// SeekGE implements keyspan.FragmentIterator. +func (l *LevelIter) SeekGE(key []byte) *Span { + l.dir = +1 + l.straddle = Span{} + l.straddleDir = 0 + l.err = nil // clear cached iteration error + + f := l.findFileGE(key) + if f != nil && l.keyType == manifest.KeyTypeRange && l.cmp(key, f.SmallestRangeKey.UserKey) < 0 { + // Peek at the previous file. 
+ prevFile := l.files.Prev() + l.files.Next() + if prevFile != nil { + // We could unconditionally return an empty span between the seek key and + // f.SmallestRangeKey, however if this span is to the left of all range + // keys on this level, it could lead to inconsistent behaviour in relative + // positioning operations. Consider this example, with a b-c range key: + // + // SeekGE(a) -> a-b:{} + // Next() -> b-c{(#5,RANGEKEYSET,@4,foo)} + // Prev() -> nil + // + // Iterators higher up in the iterator stack rely on this sort of relative + // positioning consistency. + // + // TODO(bilal): Investigate ways to be able to return straddle spans in + // cases similar to the above, while still retaining correctness. + // Return a straddling key instead of loading the file. + l.iterFile = f + if err := l.Close(); err != nil { + return l.verify(nil) + } + l.straddleDir = +1 + l.straddle = Span{ + Start: prevFile.LargestRangeKey.UserKey, + End: f.SmallestRangeKey.UserKey, + Keys: nil, + } + return l.verify(&l.straddle) + } + } + loadFileIndicator := l.loadFile(f, +1) + if loadFileIndicator == noFileLoaded { + return l.verify(nil) + } + if span := l.iter.SeekGE(key); span != nil { + return l.verify(span) + } + return l.skipEmptyFileForward() +} + +// SeekLT implements keyspan.FragmentIterator. +func (l *LevelIter) SeekLT(key []byte) *Span { + l.dir = -1 + l.straddle = Span{} + l.straddleDir = 0 + l.err = nil // clear cached iteration error + + f := l.findFileLT(key) + if f != nil && l.keyType == manifest.KeyTypeRange && l.cmp(f.LargestRangeKey.UserKey, key) < 0 { + // Peek at the next file. + nextFile := l.files.Next() + l.files.Prev() + if nextFile != nil { + // We could unconditionally return an empty span between f.LargestRangeKey + // and the seek key, however if this span is to the right of all range keys + // on this level, it could lead to inconsistent behaviour in relative + // positioning operations. 
Consider this example, with a b-c range key: + // + // SeekLT(d) -> c-d:{} + // Prev() -> b-c{(#5,RANGEKEYSET,@4,foo)} + // Next() -> nil + // + // Iterators higher up in the iterator stack rely on this sort of relative + // positioning consistency. + // + // TODO(bilal): Investigate ways to be able to return straddle spans in + // cases similar to the above, while still retaining correctness. + // Return a straddling key instead of loading the file. + l.iterFile = f + if err := l.Close(); err != nil { + return l.verify(nil) + } + l.straddleDir = -1 + l.straddle = Span{ + Start: f.LargestRangeKey.UserKey, + End: nextFile.SmallestRangeKey.UserKey, + Keys: nil, + } + return l.verify(&l.straddle) + } + } + if l.loadFile(f, -1) == noFileLoaded { + return l.verify(nil) + } + if span := l.iter.SeekLT(key); span != nil { + return l.verify(span) + } + return l.skipEmptyFileBackward() +} + +// First implements keyspan.FragmentIterator. +func (l *LevelIter) First() *Span { + l.dir = +1 + l.straddle = Span{} + l.straddleDir = 0 + l.err = nil // clear cached iteration error + + if l.loadFile(l.files.First(), +1) == noFileLoaded { + return l.verify(nil) + } + if span := l.iter.First(); span != nil { + return l.verify(span) + } + return l.skipEmptyFileForward() +} + +// Last implements keyspan.FragmentIterator. +func (l *LevelIter) Last() *Span { + l.dir = -1 + l.straddle = Span{} + l.straddleDir = 0 + l.err = nil // clear cached iteration error + + if l.loadFile(l.files.Last(), -1) == noFileLoaded { + return l.verify(nil) + } + if span := l.iter.Last(); span != nil { + return l.verify(span) + } + return l.skipEmptyFileBackward() +} + +// Next implements keyspan.FragmentIterator. 
+func (l *LevelIter) Next() *Span { + if l.err != nil || (l.iter == nil && l.iterFile == nil && l.dir > 0) { + return l.verify(nil) + } + if l.iter == nil && l.iterFile == nil { + // l.dir <= 0 + return l.First() + } + l.dir = +1 + + if l.iter != nil { + if span := l.iter.Next(); span != nil { + return l.verify(span) + } + } + return l.skipEmptyFileForward() +} + +// Prev implements keyspan.FragmentIterator. +func (l *LevelIter) Prev() *Span { + if l.err != nil || (l.iter == nil && l.iterFile == nil && l.dir < 0) { + return l.verify(nil) + } + if l.iter == nil && l.iterFile == nil { + // l.dir >= 0 + return l.Last() + } + l.dir = -1 + + if l.iter != nil { + if span := l.iter.Prev(); span != nil { + return l.verify(span) + } + } + return l.skipEmptyFileBackward() +} + +func (l *LevelIter) skipEmptyFileForward() *Span { + if l.straddleDir == 0 && l.keyType == manifest.KeyTypeRange && + l.iterFile != nil && l.iter != nil { + // We were at a file that had spans. Check if the next file that has + // spans is not directly adjacent to the current file i.e. there is a + // gap in the span keyspace between the two files. In that case, synthesize + // a "straddle span" in l.straddle and return that. + // + // Straddle spans are not created in rangedel mode. + if err := l.Close(); err != nil { + l.err = err + return l.verify(nil) + } + startKey := l.iterFile.LargestRangeKey.UserKey + // Resetting l.iterFile without loading the file into l.iter is okay and + // does not change the logic in loadFile() as long as l.iter is also nil; + // which it should be due to the Close() call above. + l.iterFile = l.files.Next() + if l.iterFile == nil { + return l.verify(nil) + } + endKey := l.iterFile.SmallestRangeKey.UserKey + if l.cmp(startKey, endKey) < 0 { + // There is a gap between the two files. Synthesize a straddling span + // to avoid unnecessarily loading the next file. 
+ l.straddle = Span{ + Start: startKey, + End: endKey, + } + l.straddleDir = +1 + return l.verify(&l.straddle) + } + } else if l.straddleDir < 0 { + // We were at a straddle key, but are now changing directions. l.iterFile + // was already moved backward by skipEmptyFileBackward, so advance it + // forward. + l.iterFile = l.files.Next() + } + l.straddle = Span{} + l.straddleDir = 0 + var span *Span + for span.Empty() { + fileToLoad := l.iterFile + if l.keyType == manifest.KeyTypePoint { + // We haven't iterated to the next file yet if we're in point key + // (rangedel) mode. + fileToLoad = l.files.Next() + } + if l.loadFile(fileToLoad, +1) == noFileLoaded { + return l.verify(nil) + } + span = l.iter.First() + // In rangedel mode, we can expect to get empty files that we'd need to + // skip over, but not in range key mode. + if l.keyType == manifest.KeyTypeRange { + break + } + } + return l.verify(span) +} + +func (l *LevelIter) skipEmptyFileBackward() *Span { + // We were at a file that had spans. Check if the previous file that has + // spans is not directly adjacent to the current file i.e. there is a + // gap in the span keyspace between the two files. In that case, synthesize + // a "straddle span" in l.straddle and return that. + // + // Straddle spans are not created in rangedel mode. + if l.straddleDir == 0 && l.keyType == manifest.KeyTypeRange && + l.iterFile != nil && l.iter != nil { + if err := l.Close(); err != nil { + l.err = err + return l.verify(nil) + } + endKey := l.iterFile.SmallestRangeKey.UserKey + // Resetting l.iterFile without loading the file into l.iter is okay and + // does not change the logic in loadFile() as long as l.iter is also nil; + // which it should be due to the Close() call above. + l.iterFile = l.files.Prev() + if l.iterFile == nil { + return l.verify(nil) + } + startKey := l.iterFile.LargestRangeKey.UserKey + if l.cmp(startKey, endKey) < 0 { + // There is a gap between the two files. 
Synthesize a straddling span + // to avoid unnecessarily loading the next file. + l.straddle = Span{ + Start: startKey, + End: endKey, + } + l.straddleDir = -1 + return l.verify(&l.straddle) + } + } else if l.straddleDir > 0 { + // We were at a straddle key, but are now changing directions. l.iterFile + // was already advanced forward by skipEmptyFileForward, so move it + // backward. + l.iterFile = l.files.Prev() + } + l.straddle = Span{} + l.straddleDir = 0 + var span *Span + for span.Empty() { + fileToLoad := l.iterFile + if l.keyType == manifest.KeyTypePoint { + fileToLoad = l.files.Prev() + } + if l.loadFile(fileToLoad, -1) == noFileLoaded { + return l.verify(nil) + } + span = l.iter.Last() + // In rangedel mode, we can expect to get empty files that we'd need to + // skip over, but not in range key mode as the filter on the FileMetadata + // should guarantee we always get a non-empty file. + if l.keyType == manifest.KeyTypeRange { + break + } + } + return l.verify(span) +} + +// verify is invoked whenever a span is returned from an iterator positioning +// method to a caller. During invariant builds, it asserts invariants to the +// caller. +func (l *LevelIter) verify(s *Span) *Span { + // NB: Do not add any logic outside the invariants.Enabled conditional to + // ensure that verify is always compiled away in production builds. + if invariants.Enabled { + if f := l.files.Current(); f != l.iterFile { + panic(fmt.Sprintf("LevelIter.files.Current (%s) and l.iterFile (%s) diverged", + f, l.iterFile)) + } + } + return s +} + +// Error implements keyspan.FragmentIterator. +func (l *LevelIter) Error() error { + if l.err != nil || l.iter == nil { + return l.err + } + return l.iter.Error() +} + +// Close implements keyspan.FragmentIterator. +func (l *LevelIter) Close() error { + if l.iter != nil { + l.err = l.iter.Close() + l.iter = nil + } + return l.err +} + +// String implements keyspan.FragmentIterator. 
+func (l *LevelIter) String() string { + if l.iterFile != nil { + return fmt.Sprintf("%s: fileNum=%s", l.level, l.iterFile.FileNum) + } + return fmt.Sprintf("%s: fileNum=", l.level) +} diff --git a/pebble/internal/keyspan/level_iter_test.go b/pebble/internal/keyspan/level_iter_test.go new file mode 100644 index 0000000..6e30396 --- /dev/null +++ b/pebble/internal/keyspan/level_iter_test.go @@ -0,0 +1,472 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "fmt" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/manifest" + "github.com/stretchr/testify/require" +) + +func TestLevelIterEquivalence(t *testing.T) { + type level [][]Span + testCases := []struct { + name string + levels []level + }{ + { + "single level, no gaps, no overlaps", + []level{ + { + { + Span{ + Start: []byte("a"), + End: []byte("b"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("b"), + End: []byte("c"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("c"), + End: []byte("d"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + }, + { + Span{ + Start: []byte("d"), + End: []byte("e"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("e"), + End: []byte("f"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("f"), + End: []byte("g"), + 
Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + }, + }, + }, + }, + { + "single level, overlapping fragments", + []level{ + { + { + Span{ + Start: []byte("a"), + End: []byte("b"), + Keys: []Key{ + { + Trailer: base.MakeTrailer(4, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("bar"), + }, + { + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }, + }, + }, + Span{ + Start: []byte("b"), + End: []byte("c"), + Keys: []Key{ + { + Trailer: base.MakeTrailer(4, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("bar"), + }, + { + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }, + }, + }, + Span{ + Start: []byte("c"), + End: []byte("d"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + }, + { + Span{ + Start: []byte("d"), + End: []byte("e"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("e"), + End: []byte("f"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("f"), + End: []byte("g"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + }, + }, + }, + }, + { + "single level, gaps between files and range keys", + []level{ + { + { + Span{ + Start: []byte("a"), + End: []byte("b"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("c"), + End: []byte("d"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + 
}}, + }, + Span{ + Start: []byte("e"), + End: []byte("f"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + }, + { + Span{ + Start: []byte("g"), + End: []byte("h"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("i"), + End: []byte("j"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + Span{ + Start: []byte("k"), + End: []byte("l"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + }, + }, + }, + }, + { + "two levels, one with overlapping unset", + []level{ + { + { + Span{ + Start: []byte("a"), + End: []byte("h"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + }, + { + Span{ + Start: []byte("l"), + End: []byte("u"), + Keys: []Key{{ + Trailer: base.MakeTrailer(2, base.InternalKeyKindRangeKeyUnset), + Suffix: nil, + Value: nil, + }}, + }, + }, + }, + { + { + Span{ + Start: []byte("e"), + End: []byte("r"), + Keys: []Key{{ + Trailer: base.MakeTrailer(1, base.InternalKeyKindRangeKeySet), + Suffix: nil, + Value: []byte("foo"), + }}, + }, + }, + }, + }, + }, + } + + for _, tc := range testCases { + var fileIters []FragmentIterator + var levelIters []FragmentIterator + var iter1, iter2 MergingIter + for j, level := range tc.levels { + j := j // Copy for use in closures down below. 
+ var levelIter LevelIter + var metas []*manifest.FileMetadata + for k, file := range level { + fileIters = append(fileIters, NewIter(base.DefaultComparer.Compare, file)) + meta := &manifest.FileMetadata{ + FileNum: base.FileNum(k + 1), + Size: 1024, + SmallestSeqNum: 2, + LargestSeqNum: 2, + SmallestRangeKey: base.MakeInternalKey(file[0].Start, file[0].SmallestKey().SeqNum(), file[0].SmallestKey().Kind()), + LargestRangeKey: base.MakeExclusiveSentinelKey(file[len(file)-1].LargestKey().Kind(), file[len(file)-1].End), + HasPointKeys: false, + HasRangeKeys: true, + } + meta.InitPhysicalBacking() + meta.ExtendRangeKeyBounds(base.DefaultComparer.Compare, meta.SmallestRangeKey, meta.LargestRangeKey) + metas = append(metas, meta) + } + + tableNewIters := func(file *manifest.FileMetadata, iterOptions SpanIterOptions) (FragmentIterator, error) { + return NewIter(base.DefaultComparer.Compare, tc.levels[j][file.FileNum-1]), nil + } + // Add all the fileMetadatas to L6. + b := &manifest.BulkVersionEdit{} + amap := make(map[base.FileNum]*manifest.FileMetadata) + for i := range metas { + amap[metas[i].FileNum] = metas[i] + } + b.Added[6] = amap + v, err := b.Apply(nil, base.DefaultComparer.Compare, base.DefaultFormatter, 0, 0, nil, manifest.ProhibitSplitUserKeys) + require.NoError(t, err) + levelIter.Init( + SpanIterOptions{}, base.DefaultComparer.Compare, tableNewIters, + v.Levels[6].Iter(), 0, manifest.KeyTypeRange, + ) + levelIters = append(levelIters, &levelIter) + } + + iter1.Init(base.DefaultComparer.Compare, VisibleTransform(base.InternalKeySeqNumMax), new(MergingBuffers), fileIters...) + iter2.Init(base.DefaultComparer.Compare, VisibleTransform(base.InternalKeySeqNumMax), new(MergingBuffers), levelIters...) + // Check iter1 and iter2 for equivalence. 
+ + require.Equal(t, iter1.First(), iter2.First(), "failed on test case %q", tc.name) + valid := true + for valid { + f1 := iter1.Next() + var f2 *Span + for { + f2 = iter2.Next() + // The level iter could produce empty spans that straddle between + // files. Ignore those. + if f2 == nil || !f2.Empty() { + break + } + } + + require.Equal(t, f1, f2, "failed on test case %q", tc.name) + valid = f1 != nil && f2 != nil + } + } +} + +func TestLevelIter(t *testing.T) { + var level [][]Span + var rangedels [][]Span + var metas []*manifest.FileMetadata + var iter FragmentIterator + var extraInfo func() string + + datadriven.RunTest(t, "testdata/level_iter", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + level = level[:0] + metas = metas[:0] + rangedels = rangedels[:0] + if iter != nil { + iter.Close() + iter = nil + } + var pointKeys []base.InternalKey + var currentRangeDels []Span + var currentFile []Span + for _, key := range strings.Split(d.Input, "\n") { + if strings.HasPrefix(key, "file") { + // Skip the very first file creation. 
+ if len(level) != 0 || len(currentFile) != 0 { + meta := &manifest.FileMetadata{ + FileNum: base.FileNum(len(level) + 1), + } + if len(currentFile) > 0 { + smallest := base.MakeInternalKey(currentFile[0].Start, currentFile[0].SmallestKey().SeqNum(), currentFile[0].SmallestKey().Kind()) + largest := base.MakeExclusiveSentinelKey(currentFile[len(currentFile)-1].LargestKey().Kind(), currentFile[len(currentFile)-1].End) + meta.ExtendRangeKeyBounds(base.DefaultComparer.Compare, smallest, largest) + } + if len(pointKeys) != 0 { + meta.ExtendPointKeyBounds(base.DefaultComparer.Compare, pointKeys[0], pointKeys[len(pointKeys)-1]) + } + meta.InitPhysicalBacking() + level = append(level, currentFile) + metas = append(metas, meta) + rangedels = append(rangedels, currentRangeDels) + currentRangeDels = nil + currentFile = nil + pointKeys = nil + } + continue + } + key = strings.TrimSpace(key) + if strings.HasPrefix(key, "point:") { + key = strings.TrimPrefix(key, "point:") + j := strings.Index(key, ":") + ikey := base.ParseInternalKey(key[:j]) + pointKeys = append(pointKeys, ikey) + if ikey.Kind() == base.InternalKeyKindRangeDelete { + currentRangeDels = append(currentRangeDels, Span{ + Start: ikey.UserKey, End: []byte(key[j+1:]), Keys: []Key{{Trailer: ikey.Trailer}}}) + } + continue + } + span := ParseSpan(key) + currentFile = append(currentFile, span) + } + meta := &manifest.FileMetadata{ + FileNum: base.FileNum(len(level) + 1), + } + meta.InitPhysicalBacking() + level = append(level, currentFile) + rangedels = append(rangedels, currentRangeDels) + if len(currentFile) > 0 { + smallest := base.MakeInternalKey(currentFile[0].Start, currentFile[0].SmallestKey().SeqNum(), currentFile[0].SmallestKey().Kind()) + largest := base.MakeExclusiveSentinelKey(currentFile[len(currentFile)-1].LargestKey().Kind(), currentFile[len(currentFile)-1].End) + meta.ExtendRangeKeyBounds(base.DefaultComparer.Compare, smallest, largest) + } + if len(pointKeys) != 0 { + 
meta.ExtendPointKeyBounds(base.DefaultComparer.Compare, pointKeys[0], pointKeys[len(pointKeys)-1]) + } + metas = append(metas, meta) + return "" + case "num-files": + return fmt.Sprintf("%d", len(level)) + case "close-iter": + _ = iter.Close() + iter = nil + return "ok" + case "iter": + keyType := manifest.KeyTypeRange + for _, arg := range d.CmdArgs { + if strings.Contains(arg.Key, "rangedel") { + keyType = manifest.KeyTypePoint + } + } + if iter == nil { + var lastFileNum base.FileNum + tableNewIters := func(file *manifest.FileMetadata, _ SpanIterOptions) (FragmentIterator, error) { + keyType := keyType + spans := level[file.FileNum-1] + if keyType == manifest.KeyTypePoint { + spans = rangedels[file.FileNum-1] + } + lastFileNum = file.FileNum + return NewIter(base.DefaultComparer.Compare, spans), nil + } + b := &manifest.BulkVersionEdit{} + amap := make(map[base.FileNum]*manifest.FileMetadata) + for i := range metas { + amap[metas[i].FileNum] = metas[i] + } + b.Added[6] = amap + v, err := b.Apply(nil, base.DefaultComparer.Compare, base.DefaultFormatter, 0, 0, nil, manifest.ProhibitSplitUserKeys) + require.NoError(t, err) + iter = NewLevelIter( + SpanIterOptions{}, base.DefaultComparer.Compare, + tableNewIters, v.Levels[6].Iter(), 6, keyType, + ) + extraInfo = func() string { + return fmt.Sprintf("file = %s.sst", lastFileNum) + } + } + + return runFragmentIteratorCmd(iter, d.Input, extraInfo) + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) + + if iter != nil { + iter.Close() + } +} diff --git a/pebble/internal/keyspan/merging_iter.go b/pebble/internal/keyspan/merging_iter.go new file mode 100644 index 0000000..c73ba59 --- /dev/null +++ b/pebble/internal/keyspan/merging_iter.go @@ -0,0 +1,1209 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "bytes" + "fmt" + "sort" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/manifest" +) + +// TODO(jackson): Consider implementing an optimization to seek lower levels +// past higher levels' RANGEKEYDELs. This would be analaogous to the +// optimization pebble.mergingIter performs for RANGEDELs during point key +// seeks. It may not be worth it, because range keys are rare and cascading +// seeks would require introducing key comparisons to switchTo{Min,Max}Heap +// where there currently are none. + +// TODO(jackson): There are several opportunities to use base.Equal in the +// MergingIter implementation, but will require a bit of plumbing to thread the +// Equal function. + +// MergingIter merges spans across levels of the LSM, exposing an iterator over +// spans that yields sets of spans fragmented at unique user key boundaries. +// +// A MergingIter is initialized with an arbitrary number of child iterators over +// fragmented spans. Each child iterator exposes fragmented key spans, such that +// overlapping keys are surfaced in a single Span. Key spans from one child +// iterator may overlap key spans from another child iterator arbitrarily. +// +// The spans combined by MergingIter will return spans with keys sorted by +// trailer descending. If the MergingIter is configured with a Transformer, it's +// permitted to modify the ordering of the spans' keys returned by MergingIter. +// +// # Algorithm +// +// The merging iterator wraps child iterators, merging and fragmenting spans +// across levels. The high-level algorithm is: +// +// 1. Initialize the heap with bound keys from child iterators' spans. +// 2. Find the next [or previous] two unique user keys' from bounds. +// 3. Consider the span formed between the two unique user keys a candidate +// span. +// 4. 
Determine if any of the child iterators' spans overlap the candidate +// span. +// 4a. If any of the child iterator's current bounds are end keys +// (during forward iteration) or start keys (during reverse +// iteration), then all the spans with that bound overlap the +// candidate span. +// 4b. Apply the configured transform, which may remove keys. +// 4c. If no spans overlap, forget the smallest (forward iteration) +// or largest (reverse iteration) unique user key and advance +// the iterators to the next unique user key. Start again from 3. +// +// # Detailed algorithm +// +// Each level (i0, i1, ...) has a user-provided input FragmentIterator. The +// merging iterator steps through individual boundaries of the underlying +// spans separately. If the underlying FragmentIterator has fragments +// [a,b){#2,#1} [b,c){#1} the mergingIterLevel.{next,prev} step through: +// +// (a, start), (b, end), (b, start), (c, end) +// +// Note that (a, start) and (b, end) are observed ONCE each, despite two keys +// sharing those bounds. Also note that (b, end) and (b, start) are two distinct +// iterator positions of a mergingIterLevel. +// +// The merging iterator maintains a heap (min during forward iteration, max +// during reverse iteration) containing the boundKeys. Each boundKey is a +// 3-tuple holding the bound user key, whether the bound is a start or end key +// and the set of keys from that level that have that bound. The heap orders +// based on the boundKey's user key only. +// +// The merging iterator is responsible for merging spans across levels to +// determine which span is next, but it's also responsible for fragmenting +// overlapping spans. Consider the example: +// +// i0: b---d e-----h +// i1: a---c h-----k +// i2: a------------------------------p +// +// fragments: a-b-c-d-e-----h-----k----------p +// +// None of the individual child iterators contain a span with the exact bounds +// [c,d), but the merging iterator must produce a span [c,d). 
To accomplish +// this, the merging iterator visits every span between unique boundary user +// keys. In the above example, this is: +// +// [a,b), [b,c), [c,d), [d,e), [e, h), [h, k), [k, p) +// +// The merging iterator first initializes the heap to prepare for iteration. +// The description below discusses the mechanics of forward iteration after a +// call to First, but the mechanics are similar for reverse iteration and +// other positioning methods. +// +// During a call to First, the heap is initialized by seeking every +// mergingIterLevel to the first bound of the first fragment. In the above +// example, this seeks the child iterators to: +// +// i0: (b, boundKindFragmentStart, [ [b,d) ]) +// i1: (a, boundKindFragmentStart, [ [a,c) ]) +// i2: (a, boundKindFragmentStart, [ [a,p) ]) +// +// After fixing up the heap, the root of the heap is a boundKey with the +// smallest user key ('a' in the example). Once the heap is setup for iteration +// in the appropriate direction and location, the merging iterator uses +// find{Next,Prev}FragmentSet to find the next/previous span bounds. +// +// During forward iteration, the root of the heap's user key is the start key +// key of next merged span. findNextFragmentSet sets m.start to this user +// key. The heap may contain other boundKeys with the same user key if another +// level has a fragment starting or ending at the same key, so the +// findNextFragmentSet method pulls from the heap until it finds the first key +// greater than m.start. This key is used as the end key. +// +// In the above example, this results in m.start = 'a', m.end = 'b' and child +// iterators in the following positions: +// +// i0: (b, boundKindFragmentStart, [ [b,d) ]) +// i1: (c, boundKindFragmentEnd, [ [a,c) ]) +// i2: (p, boundKindFragmentEnd, [ [a,p) ]) +// +// With the user key bounds of the next merged span established, +// findNextFragmentSet must determine which, if any, fragments overlap the span. 
+// During forward iteration any child iterator that is now positioned at an end +// boundary has an overlapping span. (Justification: The child iterator's end +// boundary is ≥ m.end. The corresponding start boundary must be ≤ m.start since +// there were no other user keys between m.start and m.end. So the fragments +// associated with the iterator's current end boundary have start and end bounds +// such that start ≤ m.start < m.end ≤ end). +// +// findNextFragmentSet iterates over the levels, collecting keys from any child +// iterators positioned at end boundaries. In the above example, i1 and i2 are +// positioned at end boundaries, so findNextFragmentSet collects the keys of +// [a,c) and [a,p). These spans contain the merging iterator's [m.start, m.end) +// span, but they may also extend beyond the m.start and m.end. The merging +// iterator returns the keys with the merging iter's m.start and m.end bounds, +// preserving the underlying keys' sequence numbers, key kinds and values. +// +// A MergingIter is configured with a Transform that's applied to the span +// before surfacing it to the iterator user. A Transform may remove keys +// arbitrarily, but it may not modify the values themselves. +// +// It may be the case that findNextFragmentSet finds no levels positioned at end +// boundaries, or that there are no spans remaining after applying a transform, +// in which case the span [m.start, m.end) overlaps with nothing. In this case +// findNextFragmentSet loops, repeating the above process again until it finds a +// span that does contain keys. +// +// # Memory safety +// +// The FragmentIterator interface only guarantees stability of a Span and its +// associated slices until the next positioning method is called. Adjacent Spans +// may be contained in different sstables, requring the FragmentIterator +// implementation to close one sstable, releasing its memory, before opening the +// next. 
Most of the state used by the MergingIter is derived from spans at +// current child iterator positions only, ensuring state is stable. The one +// exception is the start bound during forward iteration and the end bound +// during reverse iteration. +// +// If the heap root originates from an end boundary when findNextFragmentSet +// begins, a Next on the heap root level may invalidate the end boundary. To +// accommodate this, find{Next,Prev}FragmentSet copy the initial boundary if the +// subsequent Next/Prev would move to the next span. +type MergingIter struct { + *MergingBuffers + // start and end hold the bounds for the span currently under the + // iterator position. + // + // Invariant: None of the levels' iterators contain spans with a bound + // between start and end. For all bounds b, b ≤ start || b ≥ end. + start, end []byte + + // transformer defines a transformation to be applied to a span before it's + // yielded to the user. Transforming may filter individual keys contained + // within the span. + transformer Transformer + // span holds the iterator's current span. This span is used as the + // destination for transforms. Every tranformed span overwrites the + // previous. + span Span + err error + dir int8 + + // alloc preallocates mergingIterLevel and mergingIterItems for use by the + // merging iterator. As long as the merging iterator is used with + // manifest.NumLevels+3 and fewer fragment iterators, the merging iterator + // will not need to allocate upon initialization. The value NumLevels+3 + // mirrors the preallocated levels in iterAlloc used for point iterators. + // Invariant: cap(levels) == cap(items) + alloc struct { + levels [manifest.NumLevels + 3]mergingIterLevel + items [manifest.NumLevels + 3]mergingIterItem + } +} + +// MergingBuffers holds buffers used while merging keyspans. +type MergingBuffers struct { + // keys holds all of the keys across all levels that overlap the key span + // [start, end), sorted by Trailer descending. 
This slice is reconstituted + // in synthesizeKeys from each mergingIterLevel's keys every time the + // [start, end) bounds change. + // + // Each element points into a child iterator's memory, so the keys may not + // be directly modified. + keys keysBySeqNumKind + // levels holds levels allocated by MergingIter.init. The MergingIter will + // prefer use of its `manifest.NumLevels+3` array, so this slice will be + // longer if set. + levels []mergingIterLevel + // heap holds a slice for the merging iterator heap allocated by + // MergingIter.init. The MergingIter will prefer use of its + // `manifest.NumLevels+3` items array, so this slice will be longer if set. + heap mergingIterHeap + // buf is a buffer used to save [start, end) boundary keys. + buf []byte +} + +// PrepareForReuse discards any excessively large buffers. +func (bufs *MergingBuffers) PrepareForReuse() { + if cap(bufs.buf) > bufferReuseMaxCapacity { + bufs.buf = nil + } +} + +// MergingIter implements the FragmentIterator interface. +var _ FragmentIterator = (*MergingIter)(nil) + +type mergingIterLevel struct { + iter FragmentIterator + + // heapKey holds the current key at this level for use within the heap. 
+ heapKey boundKey +} + +func (l *mergingIterLevel) next() { + if l.heapKey.kind == boundKindFragmentStart { + l.heapKey = boundKey{ + kind: boundKindFragmentEnd, + key: l.heapKey.span.End, + span: l.heapKey.span, + } + return + } + if s := l.iter.Next(); s == nil { + l.heapKey = boundKey{kind: boundKindInvalid} + } else { + l.heapKey = boundKey{ + kind: boundKindFragmentStart, + key: s.Start, + span: s, + } + } +} + +func (l *mergingIterLevel) prev() { + if l.heapKey.kind == boundKindFragmentEnd { + l.heapKey = boundKey{ + kind: boundKindFragmentStart, + key: l.heapKey.span.Start, + span: l.heapKey.span, + } + return + } + if s := l.iter.Prev(); s == nil { + l.heapKey = boundKey{kind: boundKindInvalid} + } else { + l.heapKey = boundKey{ + kind: boundKindFragmentEnd, + key: s.End, + span: s, + } + } +} + +// Init initializes the merging iterator with the provided fragment iterators. +func (m *MergingIter) Init( + cmp base.Compare, transformer Transformer, bufs *MergingBuffers, iters ...FragmentIterator, +) { + *m = MergingIter{ + MergingBuffers: bufs, + transformer: transformer, + } + m.heap.cmp = cmp + levels, items := m.levels, m.heap.items + + // Invariant: cap(levels) >= cap(items) + // Invariant: cap(alloc.levels) == cap(alloc.items) + if len(iters) <= len(m.alloc.levels) { + // The slices allocated on the MergingIter struct are large enough. + m.levels = m.alloc.levels[:len(iters)] + m.heap.items = m.alloc.items[:0] + } else if len(iters) <= cap(levels) { + // The existing heap-allocated slices are large enough, so reuse them. + m.levels = levels[:len(iters)] + m.heap.items = items[:0] + } else { + // Heap allocate new slices. + m.levels = make([]mergingIterLevel, len(iters)) + m.heap.items = make([]mergingIterItem, 0, len(iters)) + } + for i := range m.levels { + m.levels[i] = mergingIterLevel{iter: iters[i]} + } +} + +// AddLevel adds a new level to the bottom of the merging iterator. AddLevel +// must be called after Init and before any other method. 
+func (m *MergingIter) AddLevel(iter FragmentIterator) { + m.levels = append(m.levels, mergingIterLevel{iter: iter}) +} + +// SeekGE moves the iterator to the first span covering a key greater than +// or equal to the given key. This is equivalent to seeking to the first +// span with an end key greater than the given key. +func (m *MergingIter) SeekGE(key []byte) *Span { + m.invalidate() // clear state about current position + + // SeekGE(k) seeks to the first span with an end key greater than the given + // key. The merged span M that we're searching for might straddle the seek + // `key`. In this case, the M.Start may be a key ≤ the seek key. + // + // Consider a SeekGE(dog) in the following example. + // + // i0: b---d e-----h + // i1: a---c h-----k + // i2: a------------------------------p + // merged: a-b-c-d-e-----h-----k----------p + // + // The merged span M containing 'dog' is [d,e). The 'd' of the merged span + // comes from i0's [b,d)'s end boundary. The [b,d) span does not cover any + // key >= dog, so we cannot find the span by positioning the child iterators + // using a SeekGE(dog). + // + // Instead, if we take all the child iterators' spans bounds: + // a b c d e h k p + // We want to partition them into keys ≤ `key` and keys > `key`. + // dog + // │ + // a b c d│e h k p + // │ + // The largest key on the left of the partition forms the merged span's + // start key, and the smallest key on the right of the partition forms the + // merged span's end key. Recharacterized: + // + // M.Start: the largest boundary ≤ k of any child span + // M.End: the smallest boundary > k of any child span + // + // The FragmentIterator interface doesn't implement seeking by all bounds, + // it implements seeking by containment. A SeekGE(k) will ensure we observe + // all start boundaries ≥ k and all end boundaries > k but does not ensure + // we observe end boundaries = k or any boundaries < k. 
A SeekLT(k) will + // ensure we observe all start boundaries < k and all end boundaries ≤ k but + // does not ensure we observe any start boundaries = k or any boundaries > + // k. This forces us to seek in one direction and step in the other. + // + // In a SeekGE, we want to end up oriented in the forward direction when + // complete, so we begin with searching for M.Start by SeekLT-ing every + // child iterator to `k`. For every child span found, we determine the + // largest bound ≤ `k` and use it to initialize our max heap. The resulting + // root of the max heap is a preliminary value for `M.Start`. + for i := range m.levels { + l := &m.levels[i] + s := l.iter.SeekLT(key) + if s == nil { + l.heapKey = boundKey{kind: boundKindInvalid} + } else if m.cmp(s.End, key) <= 0 { + l.heapKey = boundKey{ + kind: boundKindFragmentEnd, + key: s.End, + span: s, + } + } else { + // s.End > key && s.Start < key + // We need to use this span's start bound, since that's the largest + // bound ≤ key. + l.heapKey = boundKey{ + kind: boundKindFragmentStart, + key: s.Start, + span: s, + } + } + } + m.initMaxHeap() + if m.err != nil { + return nil + } else if len(m.heap.items) == 0 { + // There are no spans covering any key < `key`. There is no span that + // straddles the seek key. Reorient the heap into a min heap and return + // the first span we find in the forward direction. + m.switchToMinHeap() + return m.findNextFragmentSet() + } + + // The heap root is now the largest boundary key b such that: + // 1. b < k + // 2. b = k, and b is an end boundary + // There's a third case that we will need to consider later, after we've + // switched to a min heap: + // 3. there exists a start boundary key b such that b = k. + // A start boundary key equal to k would not be surfaced when we seeked all + // the levels using SeekLT(k), since no key `key`, which will serve as our candidate end + // bound. 
+ m.switchToMinHeap() + if m.err != nil { + return nil + } else if len(m.heap.items) == 0 { + return nil + } + + // Check for the case 3 described above. It's possible that when we switch + // heap directions, we discover a start boundary of some child span that is + // equal to the seek key `key`. In this case, we want this key to be our + // start boundary. + if m.heap.items[0].boundKey.kind == boundKindFragmentStart && + m.cmp(m.heap.items[0].boundKey.key, key) == 0 { + // Call findNextFragmentSet, which will set m.start to the heap root and + // proceed forward. + return m.findNextFragmentSet() + } + + m.end = m.heap.items[0].boundKey.key + if found, s := m.synthesizeKeys(+1); found && s != nil { + return s + } + return m.findNextFragmentSet() + +} + +// SeekLT moves the iterator to the last span covering a key less than the +// given key. This is equivalent to seeking to the last span with a start +// key less than the given key. +func (m *MergingIter) SeekLT(key []byte) *Span { + m.invalidate() // clear state about current position + + // SeekLT(k) seeks to the last span with a start key less than the given + // key. The merged span M that we're searching for might straddle the seek + // `key`. In this case, the M.End may be a key ≥ the seek key. + // + // Consider a SeekLT(dog) in the following example. + // + // i0: b---d e-----h + // i1: a---c h-----k + // i2: a------------------------------p + // merged: a-b-c-d-e-----h-----k----------p + // + // The merged span M containing the largest key <'dog' is [d,e). The 'e' of + // the merged span comes from i0's [e,h)'s start boundary. The [e,h) span + // does not cover any key < dog, so we cannot find the span by positioning + // the child iterators using a SeekLT(dog). + // + // Instead, if we take all the child iterators' spans bounds: + // a b c d e h k p + // We want to partition them into keys < `key` and keys ≥ `key`. 
+ // dog + // │ + // a b c d│e h k p + // │ + // The largest key on the left of the partition forms the merged span's + // start key, and the smallest key on the right of the partition forms the + // merged span's end key. Recharacterized: + // + // M.Start: the largest boundary < k of any child span + // M.End: the smallest boundary ≥ k of any child span + // + // The FragmentIterator interface doesn't implement seeking by all bounds, + // it implements seeking by containment. A SeekGE(k) will ensure we observe + // all start boundaries ≥ k and all end boundaries > k but does not ensure + // we observe end boundaries = k or any boundaries < k. A SeekLT(k) will + // ensure we observe all start boundaries < k and all end boundaries ≤ k but + // does not ensure we observe any start boundaries = k or any boundaries > + // k. This forces us to seek in one direction and step in the other. + // + // In a SeekLT, we want to end up oriented in the backward direction when + // complete, so we begin with searching for M.End by SeekGE-ing every + // child iterator to `k`. For every child span found, we determine the + // smallest bound ≥ `k` and use it to initialize our min heap. The resulting + // root of the min heap is a preliminary value for `M.End`. + for i := range m.levels { + l := &m.levels[i] + s := l.iter.SeekGE(key) + if s == nil { + l.heapKey = boundKey{kind: boundKindInvalid} + } else if m.cmp(s.Start, key) >= 0 { + l.heapKey = boundKey{ + kind: boundKindFragmentStart, + key: s.Start, + span: s, + } + } else { + // s.Start < key + // We need to use this span's end bound, since that's the smallest + // bound > key. + l.heapKey = boundKey{ + kind: boundKindFragmentEnd, + key: s.End, + span: s, + } + } + } + m.initMinHeap() + if m.err != nil { + return nil + } else if len(m.heap.items) == 0 { + // There are no spans covering any key ≥ `key`. There is no span that + // straddles the seek key. 
Reorient the heap into a max heap and return + // the first span we find in the reverse direction. + m.switchToMaxHeap() + return m.findPrevFragmentSet() + } + + // The heap root is now the smallest boundary key b such that: + // 1. b > k + // 2. b = k, and b is a start boundary + // There's a third case that we will need to consider later, after we've + // switched to a max heap: + // 3. there exists an end boundary key b such that b = k. + // An end boundary key equal to k would not be surfaced when we seeked all + // the levels using SeekGE(k), since k would not be contained within the + // exclusive end boundary. + // + // Assume that the tightest boundary ≥ k is the current heap root (cases 1 & + // 2). After we switch to a max heap, we'll check for the third case and + // adjust the end boundary if necessary. + m.end = m.heap.items[0].boundKey.key + + // Before switching the direction of the heap, save a copy of the end + // boundary if it's the start boundary of some child span. Prev-ing the + // child iterator might switch files and invalidate the memory of the bound. + if m.heap.items[0].boundKey.kind == boundKindFragmentStart { + m.buf = append(m.buf[:0], m.end...) + m.end = m.buf + } + + // Switch to a max heap. This will move each level to the previous bound in + // every level, and then establish a max heap. This allows us to obtain the + // largest boundary key < `key`, which will serve as our candidate start + // bound. + m.switchToMaxHeap() + if m.err != nil { + return nil + } else if len(m.heap.items) == 0 { + return nil + } + // Check for the case 3 described above. It's possible that when we switch + // heap directions, we discover an end boundary of some child span that is + // equal to the seek key `key`. In this case, we want this key to be our end + // boundary. 
+ if m.heap.items[0].boundKey.kind == boundKindFragmentEnd && + m.cmp(m.heap.items[0].boundKey.key, key) == 0 { + // Call findPrevFragmentSet, which will set m.end to the heap root and + // proceed backwards. + return m.findPrevFragmentSet() + } + + m.start = m.heap.items[0].boundKey.key + if found, s := m.synthesizeKeys(-1); found && s != nil { + return s + } + return m.findPrevFragmentSet() +} + +// First seeks the iterator to the first span. +func (m *MergingIter) First() *Span { + m.invalidate() // clear state about current position + for i := range m.levels { + if s := m.levels[i].iter.First(); s == nil { + m.levels[i].heapKey = boundKey{kind: boundKindInvalid} + } else { + m.levels[i].heapKey = boundKey{ + kind: boundKindFragmentStart, + key: s.Start, + span: s, + } + } + } + m.initMinHeap() + return m.findNextFragmentSet() +} + +// Last seeks the iterator to the last span. +func (m *MergingIter) Last() *Span { + m.invalidate() // clear state about current position + for i := range m.levels { + if s := m.levels[i].iter.Last(); s == nil { + m.levels[i].heapKey = boundKey{kind: boundKindInvalid} + } else { + m.levels[i].heapKey = boundKey{ + kind: boundKindFragmentEnd, + key: s.End, + span: s, + } + } + } + m.initMaxHeap() + return m.findPrevFragmentSet() +} + +// Next advances the iterator to the next span. +func (m *MergingIter) Next() *Span { + if m.err != nil { + return nil + } + if m.dir == +1 && (m.end == nil || m.start == nil) { + return nil + } + if m.dir != +1 { + m.switchToMinHeap() + } + return m.findNextFragmentSet() +} + +// Prev advances the iterator to the previous span. +func (m *MergingIter) Prev() *Span { + if m.err != nil { + return nil + } + if m.dir == -1 && (m.end == nil || m.start == nil) { + return nil + } + if m.dir != -1 { + m.switchToMaxHeap() + } + return m.findPrevFragmentSet() +} + +// Error returns any accumulated error. 
+func (m *MergingIter) Error() error { + if m.heap.len() == 0 || m.err != nil { + return m.err + } + return m.levels[m.heap.items[0].index].iter.Error() +} + +// Close closes the iterator, releasing all acquired resources. +func (m *MergingIter) Close() error { + for i := range m.levels { + if err := m.levels[i].iter.Close(); err != nil && m.err == nil { + m.err = err + } + } + m.levels = nil + m.heap.items = m.heap.items[:0] + return m.err +} + +// String implements fmt.Stringer. +func (m *MergingIter) String() string { + return "merging-keyspan" +} + +func (m *MergingIter) initMinHeap() { + m.dir = +1 + m.heap.reverse = false + m.initHeap() +} + +func (m *MergingIter) initMaxHeap() { + m.dir = -1 + m.heap.reverse = true + m.initHeap() +} + +func (m *MergingIter) initHeap() { + m.heap.items = m.heap.items[:0] + for i := range m.levels { + if l := &m.levels[i]; l.heapKey.kind != boundKindInvalid { + m.heap.items = append(m.heap.items, mergingIterItem{ + index: i, + boundKey: &l.heapKey, + }) + } else { + m.err = firstError(m.err, l.iter.Error()) + if m.err != nil { + return + } + } + } + m.heap.init() +} + +func (m *MergingIter) switchToMinHeap() { + // switchToMinHeap reorients the heap for forward iteration, without moving + // the current MergingIter position. + + // The iterator is currently positioned at the span [m.start, m.end), + // oriented in the reverse direction, so each level's iterator is positioned + // to the largest key ≤ m.start. To reorient in the forward direction, we + // must advance each level's iterator to the smallest key ≥ m.end. Consider + // this three-level example. 
+ // + // i0: b---d e-----h + // i1: a---c h-----k + // i2: a------------------------------p + // + // merged: a-b-c-d-e-----h-----k----------p + // + // If currently positioned at the merged span [c,d), then the level + // iterators' heap keys are: + // + // i0: (b, [b, d)) i1: (c, [a,c)) i2: (a, [a,p)) + // + // Reversing the heap should not move the merging iterator and should not + // change the current [m.start, m.end) bounds. It should only prepare for + // forward iteration by updating the child iterators' heap keys to: + // + // i0: (d, [b, d)) i1: (h, [h,k)) i2: (p, [a,p)) + // + // In every level the first key ≥ m.end is the next in the iterator. + // Justification: Suppose not and a level iterator's next key was some key k + // such that k < m.end. The max-heap invariant dictates that the current + // iterator position is the largest entry with a user key ≥ m.start. This + // means k > m.start. We started with the assumption that k < m.end, so + // m.start < k < m.end. But then k is between our current span bounds, + // and reverse iteration would have constructed the current interval to be + // [k, m.end) not [m.start, m.end). + + if invariants.Enabled { + for i := range m.levels { + l := &m.levels[i] + if l.heapKey.kind != boundKindInvalid && m.cmp(l.heapKey.key, m.start) > 0 { + panic("pebble: invariant violation: max-heap key > m.start") + } + } + } + + for i := range m.levels { + m.levels[i].next() + } + m.initMinHeap() +} + +func (m *MergingIter) switchToMaxHeap() { + // switchToMaxHeap reorients the heap for reverse iteration, without moving + // the current MergingIter position. + + // The iterator is currently positioned at the span [m.start, m.end), + // oriented in the forward direction. Each level's iterator is positioned at + // the smallest bound ≥ m.end. To reorient in the reverse direction, we must + // move each level's iterator to the largest key ≤ m.start. Consider this + // three-level example. 
+ // + // i0: b---d e-----h + // i1: a---c h-----k + // i2: a------------------------------p + // + // merged: a-b-c-d-e-----h-----k----------p + // + // If currently positioned at the merged span [c,d), then the level + // iterators' heap keys are: + // + // i0: (d, [b, d)) i1: (h, [h,k)) i2: (p, [a,p)) + // + // Reversing the heap should not move the merging iterator and should not + // change the current [m.start, m.end) bounds. It should only prepare for + // reverse iteration by updating the child iterators' heap keys to: + // + // i0: (b, [b, d)) i1: (c, [a,c)) i2: (a, [a,p)) + // + // In every level the largest key ≤ m.start is the prev in the iterator. + // Justification: Suppose not and a level iterator's prev key was some key k + // such that k > m.start. The min-heap invariant dictates that the current + // iterator position is the smallest entry with a user key ≥ m.end. This + // means k < m.end, otherwise the iterator would be positioned at k. We + // started with the assumption that k > m.start, so m.start < k < m.end. But + // then k is between our current span bounds, and reverse iteration + // would have constructed the current interval to be [m.start, k) not + // [m.start, m.end). + + if invariants.Enabled { + for i := range m.levels { + l := &m.levels[i] + if l.heapKey.kind != boundKindInvalid && m.cmp(l.heapKey.key, m.end) < 0 { + panic("pebble: invariant violation: min-heap key < m.end") + } + } + } + + for i := range m.levels { + m.levels[i].prev() + } + m.initMaxHeap() +} + +func (m *MergingIter) cmp(a, b []byte) int { + return m.heap.cmp(a, b) +} + +func (m *MergingIter) findNextFragmentSet() *Span { + // Each iteration of this loop considers a new merged span between unique + // user keys. An iteration may find that there exists no overlap for a given + // span, (eg, if the spans [a,b), [d, e) exist within level iterators, the + // below loop will still consider [b,d) before continuing to [d, e)). 
It + // returns when it finds a span that is covered by at least one key. + + for m.heap.len() > 0 && m.err == nil { + // Initialize the next span's start bound. SeekGE and First prepare the + // heap without advancing. Next leaves the heap in a state such that the + // root is the smallest bound key equal to the returned span's end key, + // so the heap is already positioned at the next merged span's start key. + + // NB: m.heapRoot() might be either an end boundary OR a start boundary + // of a level's span. Both end and start boundaries may still be a start + // key of a span in the set of fragmented spans returned by MergingIter. + // Consider the scenario: + // a----------l #1 + // b-----------m #2 + // + // The merged, fully-fragmented spans that MergingIter exposes to the caller + // have bounds: + // a-b #1 + // b--------l #1 + // b--------l #2 + // l-m #2 + // + // When advancing to l-m#2, we must set m.start to 'l', which originated + // from [a,l)#1's end boundary. + m.start = m.heap.items[0].boundKey.key + + // Before calling nextEntry, consider whether it might invalidate our + // start boundary. If the start boundary key originated from an end + // boundary, then we need to copy the start key before advancing the + // underlying iterator to the next Span. + if m.heap.items[0].boundKey.kind == boundKindFragmentEnd { + m.buf = append(m.buf[:0], m.start...) + m.start = m.buf + } + + // There may be many entries all with the same user key. Spans in other + // levels may also start or end at this same user key. For eg: + // L1: [a, c) [c, d) + // L2: [c, e) + // If we're positioned at L1's end(c) end boundary, we want to advance + // to the first bound > c. + m.nextEntry() + for len(m.heap.items) > 0 && m.err == nil && m.cmp(m.heapRoot(), m.start) == 0 { + m.nextEntry() + } + if len(m.heap.items) == 0 || m.err != nil { + break + } + + // The current entry at the top of the heap is the first key > m.start. 
+ // It must become the end bound for the span we will return to the user. + // In the above example, the root of the heap is L1's end(d). + m.end = m.heap.items[0].boundKey.key + + // Each level within m.levels may have a span that overlaps the + // fragmented key span [m.start, m.end). Update m.keys to point to them + // and sort them by kind, sequence number. There may not be any keys + // defined over [m.start, m.end) if we're between the end of one span + // and the start of the next, OR if the configured transform filters any + // keys out. We allow empty spans that were emitted by child iterators, but + // we elide empty spans created by the mergingIter itself that don't overlap + // with any child iterator returned spans (i.e. empty spans that bridge two + // distinct child-iterator-defined spans). + if found, s := m.synthesizeKeys(+1); found && s != nil { + return s + } + } + // Exhausted. + m.clear() + return nil +} + +func (m *MergingIter) findPrevFragmentSet() *Span { + // Each iteration of this loop considers a new merged span between unique + // user keys. An iteration may find that there exists no overlap for a given + // span, (eg, if the spans [a,b), [d, e) exist within level iterators, the + // below loop will still consider [b,d) before continuing to [a, b)). It + // returns when it finds a span that is covered by at least one key. + + for m.heap.len() > 0 && m.err == nil { + // Initialize the next span's end bound. SeekLT and Last prepare the + // heap without advancing. Prev leaves the heap in a state such that the + // root is the largest bound key equal to the returned span's start key, + // so the heap is already positioned at the next merged span's end key. + + // NB: m.heapRoot() might be either an end boundary OR a start boundary + // of a level's span. Both end and start boundaries may still be a start + // key of a span returned by MergingIter. 
Consider the scenario: + // a----------l #2 + // b-----------m #1 + // + // The merged, fully-fragmented spans that MergingIter exposes to the caller + // have bounds: + // a-b #2 + // b--------l #2 + // b--------l #1 + // l-m #1 + // + // When Preving to a-b#2, we must set m.end to 'b', which originated + // from [b,m)#1's start boundary. + m.end = m.heap.items[0].boundKey.key + + // Before calling prevEntry, consider whether it might invalidate our + // end boundary. If the end boundary key originated from a start + // boundary, then we need to copy the end key before advancing the + // underlying iterator to the previous Span. + if m.heap.items[0].boundKey.kind == boundKindFragmentStart { + m.buf = append(m.buf[:0], m.end...) + m.end = m.buf + } + + // There may be many entries all with the same user key. Spans in other + // levels may also start or end at this same user key. For eg: + // L1: [a, c) [c, d) + // L2: [c, e) + // If we're positioned at L1's start(c) start boundary, we want to prev + // to move to the first bound < c. + m.prevEntry() + for len(m.heap.items) > 0 && m.err == nil && m.cmp(m.heapRoot(), m.end) == 0 { + m.prevEntry() + } + if len(m.heap.items) == 0 || m.err != nil { + break + } + + // The current entry at the top of the heap is the first key < m.end. + // It must become the start bound for the span we will return to the + // user. In the above example, the root of the heap is L1's start(a). + m.start = m.heap.items[0].boundKey.key + + // Each level within m.levels may have a set of keys that overlap the + // fragmented key span [m.start, m.end). Update m.keys to point to them + // and sort them by kind, sequence number. There may not be any keys + // spanning [m.start, m.end) if we're between the end of one span and + // the start of the next, OR if the configured transform filters any + // keys out. 
We allow empty spans that were emitted by child iterators, but + // we elide empty spans created by the mergingIter itself that don't overlap + // with any child iterator returned spans (i.e. empty spans that bridge two + // distinct child-iterator-defined spans). + if found, s := m.synthesizeKeys(-1); found && s != nil { + return s + } + } + // Exhausted. + m.clear() + return nil +} + +func (m *MergingIter) heapRoot() []byte { + return m.heap.items[0].boundKey.key +} + +// synthesizeKeys is called by find{Next,Prev}FragmentSet to populate and +// sort the set of keys overlapping [m.start, m.end). +// +// During forward iteration, if the current heap item is a fragment end, +// then the fragment's start must be ≤ m.start and the fragment overlaps the +// current iterator position of [m.start, m.end). +// +// During reverse iteration, if the current heap item is a fragment start, +// then the fragment's end must be ≥ m.end and the fragment overlaps the +// current iteration position of [m.start, m.end). +// +// The boolean return value, `found`, is true if the returned span overlaps +// with a span returned by a child iterator. +func (m *MergingIter) synthesizeKeys(dir int8) (bool, *Span) { + if invariants.Enabled { + if m.cmp(m.start, m.end) >= 0 { + panic(fmt.Sprintf("pebble: invariant violation: span start ≥ end: %s >= %s", m.start, m.end)) + } + } + + m.keys = m.keys[:0] + found := false + for i := range m.levels { + if dir == +1 && m.levels[i].heapKey.kind == boundKindFragmentEnd || + dir == -1 && m.levels[i].heapKey.kind == boundKindFragmentStart { + m.keys = append(m.keys, m.levels[i].heapKey.span.Keys...) + found = true + } + } + // TODO(jackson): We should be able to remove this sort and instead + // guarantee that we'll return keys in the order of the levels they're from. + // With careful iterator construction, this would guarantee that they're + // sorted by trailer descending for the range key iteration use case. 
+ sort.Sort(&m.keys) + + // Apply the configured transform. See VisibleTransform. + m.span = Span{ + Start: m.start, + End: m.end, + Keys: m.keys, + KeysOrder: ByTrailerDesc, + } + // NB: m.heap.cmp is a base.Compare, whereas m.cmp is a method on + // MergingIter. + if err := m.transformer.Transform(m.heap.cmp, m.span, &m.span); err != nil { + m.err = err + return false, nil + } + return found, &m.span +} + +func (m *MergingIter) invalidate() { + m.err = nil +} + +func (m *MergingIter) clear() { + for fi := range m.keys { + m.keys[fi] = Key{} + } + m.keys = m.keys[:0] +} + +// nextEntry steps to the next entry. +func (m *MergingIter) nextEntry() { + l := &m.levels[m.heap.items[0].index] + l.next() + if !l.heapKey.valid() { + // l.iter is exhausted. + m.err = l.iter.Error() + if m.err == nil { + m.heap.pop() + } + return + } + + if m.heap.len() > 1 { + m.heap.fix(0) + } +} + +// prevEntry steps to the previous entry. +func (m *MergingIter) prevEntry() { + l := &m.levels[m.heap.items[0].index] + l.prev() + if !l.heapKey.valid() { + // l.iter is exhausted. + m.err = l.iter.Error() + if m.err == nil { + m.heap.pop() + } + return + } + + if m.heap.len() > 1 { + m.heap.fix(0) + } +} + +// DebugString returns a string representing the current internal state of the +// merging iterator and its heap for debugging purposes. +func (m *MergingIter) DebugString() string { + var buf bytes.Buffer + fmt.Fprintf(&buf, "Current bounds: [%q, %q)\n", m.start, m.end) + for i := range m.levels { + fmt.Fprintf(&buf, "%d: heap key %s\n", i, m.levels[i].heapKey) + } + return buf.String() +} + +type mergingIterItem struct { + // boundKey points to the corresponding mergingIterLevel's `iterKey`. + *boundKey + // index is the index of this level within the MergingIter's levels field. + index int +} + +// mergingIterHeap is copied from mergingIterHeap defined in the root pebble +// package for use with point keys. 
+ +type mergingIterHeap struct { + cmp base.Compare + reverse bool + items []mergingIterItem +} + +func (h *mergingIterHeap) len() int { + return len(h.items) +} + +func (h *mergingIterHeap) less(i, j int) bool { + // This key comparison only uses the user key and not the boundKind. Bound + // kind doesn't matter because when stepping over a user key, + // findNextFragmentSet and findPrevFragmentSet skip past all heap items with + // that user key, and makes no assumptions on ordering. All other heap + // examinations only consider the user key. + ik, jk := h.items[i].key, h.items[j].key + c := h.cmp(ik, jk) + if h.reverse { + return c > 0 + } + return c < 0 +} + +func (h *mergingIterHeap) swap(i, j int) { + h.items[i], h.items[j] = h.items[j], h.items[i] +} + +// init, fix, up and down are copied from the go stdlib. +func (h *mergingIterHeap) init() { + // heapify + n := h.len() + for i := n/2 - 1; i >= 0; i-- { + h.down(i, n) + } +} + +func (h *mergingIterHeap) fix(i int) { + if !h.down(i, h.len()) { + h.up(i) + } +} + +func (h *mergingIterHeap) pop() *mergingIterItem { + n := h.len() - 1 + h.swap(0, n) + h.down(0, n) + item := &h.items[n] + h.items = h.items[:n] + return item +} + +func (h *mergingIterHeap) up(j int) { + for { + i := (j - 1) / 2 // parent + if i == j || !h.less(j, i) { + break + } + h.swap(i, j) + j = i + } +} + +func (h *mergingIterHeap) down(i0, n int) bool { + i := i0 + for { + j1 := 2*i + 1 + if j1 >= n || j1 < 0 { // j1 < 0 after int overflow + break + } + j := j1 // left child + if j2 := j1 + 1; j2 < n && h.less(j2, j1) { + j = j2 // = 2*i + 2 // right child + } + if !h.less(j, i) { + break + } + h.swap(i, j) + i = j + } + return i > i0 +} + +type boundKind int8 + +const ( + boundKindInvalid boundKind = iota + boundKindFragmentStart + boundKindFragmentEnd +) + +type boundKey struct { + kind boundKind + key []byte + // span holds the span the bound key comes from. + // + // If kind is boundKindFragmentStart, then key is span.Start. 
If kind is + // boundKindFragmentEnd, then key is span.End. + span *Span +} + +func (k boundKey) valid() bool { + return k.kind != boundKindInvalid +} + +func (k boundKey) String() string { + var buf bytes.Buffer + switch k.kind { + case boundKindInvalid: + fmt.Fprint(&buf, "invalid") + case boundKindFragmentStart: + fmt.Fprint(&buf, "fragment-start") + case boundKindFragmentEnd: + fmt.Fprint(&buf, "fragment-end ") + default: + fmt.Fprintf(&buf, "unknown-kind(%d)", k.kind) + } + fmt.Fprintf(&buf, " %s [", k.key) + fmt.Fprintf(&buf, "%s", k.span) + fmt.Fprint(&buf, "]") + return buf.String() +} diff --git a/pebble/internal/keyspan/merging_iter_test.go b/pebble/internal/keyspan/merging_iter_test.go new file mode 100644 index 0000000..18650bb --- /dev/null +++ b/pebble/internal/keyspan/merging_iter_test.go @@ -0,0 +1,252 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "bytes" + "fmt" + "math/rand" + "slices" + "strconv" + "strings" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/stretchr/testify/require" +) + +func TestMergingIter(t *testing.T) { + cmp := base.DefaultComparer.Compare + + var definedIters []FragmentIterator + var buf bytes.Buffer + datadriven.RunTest(t, "testdata/merging_iter", func(t *testing.T, td *datadriven.TestData) string { + switch td.Cmd { + case "define": + definedIters = definedIters[:0] + lines := strings.Split(strings.TrimSpace(td.Input), "\n") + var spans []Span + for _, line := range lines { + if line == "--" { + definedIters = append(definedIters, &invalidatingIter{iter: NewIter(cmp, spans)}) + spans = nil + continue + } + spans = append(spans, ParseSpan(line)) + } + if len(spans) > 0 { + definedIters = append(definedIters, &invalidatingIter{iter: NewIter(cmp, spans)}) + } + return fmt.Sprintf("%d levels", len(definedIters)) + case "iter": + buf.Reset() + pctx := probeContext{log: &buf} + snapshot := base.InternalKeySeqNumMax + iters := slices.Clone(definedIters) + for _, cmdArg := range td.CmdArgs { + switch cmdArg.Key { + case "snapshot": + var err error + snapshot, err = strconv.ParseUint(cmdArg.Vals[0], 10, 64) + require.NoError(t, err) + case "probes": + // The first value indicates which of the merging iterator's + // child iterators is the target. + i, err := strconv.Atoi(cmdArg.Vals[0]) + if err != nil { + return err.Error() + } + // The remaining values define probes to attach. + iters[i] = attachProbes(iters[i], pctx, parseProbes(cmdArg.Vals[1:]...)...) + default: + return fmt.Sprintf("unrecognized arg %q", cmdArg.Key) + } + } + var iter MergingIter + iter.Init(cmp, VisibleTransform(snapshot), new(MergingBuffers), iters...) 
+ runIterCmd(t, td, &iter, &buf) + return buf.String() + default: + return fmt.Sprintf("unrecognized command %q", td.Cmd) + } + }) +} + +// TestMergingIter_FragmenterEquivalence tests for equivalence between the +// fragmentation performed on-the-fly by the MergingIter and the fragmentation +// performed by the Fragmenter. +// +// It does this by producing 1-10 levels of well-formed fragments. Generated +// fragments may overlap other levels arbitrarily, but within their level +// generated fragments may only overlap other fragments that share the same user +// key bounds. +// +// The test then feeds all the fragments, across all levels, into a Fragmenter +// and produces a Iter over those fragments. The test also constructs a +// MergingIter with a separate Iter for each level. It runs a random +// series of operations, applying each operation to both. It asserts that each +// operation has identical results on both iterators. +func TestMergingIter_FragmenterEquivalence(t *testing.T) { + seed := time.Now().UnixNano() + for i := int64(0); i < 10; i++ { + testFragmenterEquivalenceOnce(t, seed+i) + } +} + +func TestMergingIter_FragmenterEquivalence_Seed(t *testing.T) { + // This test uses a fixed seed. It's useful to manually edit its seed when + // debugging a test failure of the variable-seed test. + const seed = 1644517830186873000 + testFragmenterEquivalenceOnce(t, seed) +} + +func testFragmenterEquivalenceOnce(t *testing.T, seed int64) { + cmp := testkeys.Comparer.Compare + rng := rand.New(rand.NewSource(seed)) + t.Logf("seed = %d", seed) + + // Use a key space of alphanumeric strings, with a random max length between + // 1-3. Repeat keys are more common at the lower max lengths. + ks := testkeys.Alpha(rng.Intn(3) + 1) + + // Generate between 1 and 10 levels of fragment iterators. 
+ levels := make([][]Span, rng.Intn(10)+1) + iters := make([]FragmentIterator, len(levels)) + var allSpans []Span + var buf bytes.Buffer + for l := 0; l < len(levels); l++ { + fmt.Fprintf(&buf, "level %d: ", l) + for keyspaceStartIdx := int64(0); keyspaceStartIdx < ks.Count(); { + // Generate spans of lengths of up to a third of the keyspace. + spanStartIdx := keyspaceStartIdx + rng.Int63n(ks.Count()/3) + spanEndIdx := spanStartIdx + rng.Int63n(ks.Count()/3) + 1 + + if spanEndIdx < ks.Count() { + keyCount := uint64(rng.Intn(3) + 1) + s := Span{ + Start: testkeys.Key(ks, spanStartIdx), + End: testkeys.Key(ks, spanEndIdx), + Keys: make([]Key, 0, keyCount), + } + for k := keyCount; k > 0; k-- { + seqNum := uint64((len(levels)-l)*3) + k + s.Keys = append(s.Keys, Key{ + Trailer: base.MakeTrailer(seqNum, base.InternalKeyKindRangeKeySet), + }) + } + if len(levels[l]) > 0 { + fmt.Fprint(&buf, ", ") + } + fmt.Fprintf(&buf, "%s", s) + + levels[l] = append(levels[l], s) + allSpans = append(allSpans, s) + } + keyspaceStartIdx = spanEndIdx + } + iters[l] = &invalidatingIter{iter: NewIter(cmp, levels[l])} + fmt.Fprintln(&buf) + } + + // Fragment the spans across the levels. + var allFragmented []Span + f := Fragmenter{ + Cmp: cmp, + Format: testkeys.Comparer.FormatKey, + Emit: func(span Span) { + allFragmented = append(allFragmented, span) + }, + } + Sort(f.Cmp, allSpans) + for _, s := range allSpans { + f.Add(s) + } + f.Finish() + + // Log all the levels and their fragments, as well as the fully-fragmented + // spans produced by the Fragmenter. + fmt.Fprintln(&buf, "Fragmenter fragments:") + for i, s := range allFragmented { + if i > 0 { + fmt.Fprint(&buf, ", ") + } + fmt.Fprint(&buf, s) + } + t.Logf("%d levels:\n%s\n", len(levels), buf.String()) + + fragmenterIter := NewIter(f.Cmp, allFragmented) + mergingIter := &MergingIter{} + mergingIter.Init(f.Cmp, VisibleTransform(base.InternalKeySeqNumMax), new(MergingBuffers), iters...) 
+ + // Position both so that it's okay to perform relative positioning + // operations immediately. + mergingIter.First() + fragmenterIter.First() + + type opKind struct { + weight int + fn func() (str string, f *Span, m *Span) + } + ops := []opKind{ + {weight: 2, fn: func() (string, *Span, *Span) { + return "First()", fragmenterIter.First(), mergingIter.First() + }}, + {weight: 2, fn: func() (string, *Span, *Span) { + return "Last()", fragmenterIter.Last(), mergingIter.Last() + }}, + {weight: 5, fn: func() (string, *Span, *Span) { + k := testkeys.Key(ks, rng.Int63n(ks.Count())) + return fmt.Sprintf("SeekGE(%q)", k), + fragmenterIter.SeekGE(k), + mergingIter.SeekGE(k) + }}, + {weight: 5, fn: func() (string, *Span, *Span) { + k := testkeys.Key(ks, rng.Int63n(ks.Count())) + return fmt.Sprintf("SeekLT(%q)", k), + fragmenterIter.SeekLT(k), + mergingIter.SeekLT(k) + }}, + {weight: 50, fn: func() (string, *Span, *Span) { + return "Next()", fragmenterIter.Next(), mergingIter.Next() + }}, + {weight: 50, fn: func() (string, *Span, *Span) { + return "Prev()", fragmenterIter.Prev(), mergingIter.Prev() + }}, + } + var totalWeight int + for _, op := range ops { + totalWeight += op.weight + } + + var fragmenterBuf bytes.Buffer + var mergingBuf bytes.Buffer + opCount := rng.Intn(200) + 50 + for i := 0; i < opCount; i++ { + p := rng.Intn(totalWeight) + opIndex := 0 + for i, op := range ops { + if p < op.weight { + opIndex = i + break + } + p -= op.weight + } + + opString, fs, ms := ops[opIndex].fn() + + fragmenterBuf.Reset() + mergingBuf.Reset() + fmt.Fprint(&fragmenterBuf, fs) + fmt.Fprint(&mergingBuf, ms) + if fragmenterBuf.String() != mergingBuf.String() { + t.Fatalf("seed %d, op %d: %s = %s, fragmenter iterator returned %s", + seed, i, opString, mergingBuf.String(), fragmenterBuf.String()) + } + t.Logf("op %d: %s = %s", i, opString, fragmenterBuf.String()) + } +} diff --git a/pebble/internal/keyspan/seek.go b/pebble/internal/keyspan/seek.go new file mode 100644 index 
0000000..efcf682 --- /dev/null +++ b/pebble/internal/keyspan/seek.go @@ -0,0 +1,48 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import "github.com/cockroachdb/pebble/internal/base" + +// SeekLE seeks to the span that contains or is before the target key. +func SeekLE(cmp base.Compare, iter FragmentIterator, key []byte) *Span { + // NB: We use SeekLT in order to land on the proper span for a search + // key that resides in the middle of a span. Consider the scenario: + // + // a---e + // e---i + // + // The spans are indexed by their start keys `a` and `e`. If the + // search key is `c` we want to land on the span [a,e). If we were to + // use SeekGE then the search key `c` would land on the span [e,i) and + // we'd have to backtrack. The one complexity here is what happens for the + // search key `e`. In that case SeekLT will land us on the span [a,e) + // and we'll have to move forward. + iterSpan := iter.SeekLT(key) + + if iterSpan == nil { + // Advance the iterator once to see if the next span has a start key + // equal to key. + iterSpan = iter.Next() + if iterSpan == nil || cmp(key, iterSpan.Start) < 0 { + // The iterator is exhausted or we've hit the next span. + return nil + } + } else { + // Invariant: key > iterSpan.Start + if cmp(key, iterSpan.End) >= 0 { + // The current span lies entirely before the search key. Check to see if + // the next span contains the search key. If it doesn't, we'll backup + // and return to our earlier candidate. + iterSpan = iter.Next() + if iterSpan == nil || cmp(key, iterSpan.Start) < 0 { + // The next span is past our search key or there is no next span. Go + // back. 
+ iterSpan = iter.Prev() + } + } + } + return iterSpan +} diff --git a/pebble/internal/keyspan/seek_test.go b/pebble/internal/keyspan/seek_test.go new file mode 100644 index 0000000..aa1d643 --- /dev/null +++ b/pebble/internal/keyspan/seek_test.go @@ -0,0 +1,63 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "bytes" + "fmt" + "strconv" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" +) + +func TestSeek(t *testing.T) { + cmp := base.DefaultComparer.Compare + fmtKey := base.DefaultComparer.FormatKey + var iter FragmentIterator + var buf bytes.Buffer + + datadriven.RunTest(t, "testdata/seek", func(t *testing.T, d *datadriven.TestData) string { + buf.Reset() + switch d.Cmd { + case "build": + spans := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeDelete) + for _, s := range spans { + fmt.Fprintln(&buf, s) + } + iter = NewIter(cmp, spans) + return buf.String() + case "seek-ge", "seek-le": + seek := SeekLE + if d.Cmd == "seek-ge" { + seek = func(_ base.Compare, iter FragmentIterator, key []byte) *Span { + return iter.SeekGE(key) + } + } + + for _, line := range strings.Split(d.Input, "\n") { + parts := strings.Fields(line) + if len(parts) != 2 { + return fmt.Sprintf("malformed input: %s", line) + } + seq, err := strconv.ParseUint(parts[1], 10, 64) + if err != nil { + return err.Error() + } + span := seek(cmp, iter, []byte(parts[0])) + if span != nil { + visible := span.Visible(seq) + span = &visible + } + fmt.Fprintln(&buf, span) + } + return buf.String() + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} diff --git a/pebble/internal/keyspan/span.go b/pebble/internal/keyspan/span.go new file mode 100644 index 0000000..257b373 --- /dev/null +++ b/pebble/internal/keyspan/span.go @@ -0,0 +1,467 @@ +// 
Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan // import "github.com/cockroachdb/pebble/internal/keyspan" + +import ( + "bytes" + "fmt" + "sort" + "strconv" + "strings" + "unicode" + + "github.com/cockroachdb/pebble/internal/base" +) + +// Span represents a set of keys over a span of user key space. All of the keys +// within a Span are applied across the span's key span indicated by Start and +// End. Each internal key applied over the user key span appears as a separate +// Key, with its own kind and sequence number. Optionally, each Key may also +// have a Suffix and/or Value. +// +// Note that the start user key is inclusive and the end user key is exclusive. +// +// Currently the only supported key kinds are: +// +// RANGEDEL, RANGEKEYSET, RANGEKEYUNSET, RANGEKEYDEL. +type Span struct { + // Start and End encode the user key range of all the contained items, with + // an inclusive start key and exclusive end key. Both Start and End must be + // non-nil, or both nil if representing an invalid Span. + Start, End []byte + // Keys holds the set of keys applied over the [Start, End) user key range. + // Keys is sorted by (SeqNum, Kind) descending, unless otherwise specified + // by the context. If SeqNum and Kind are equal, the order of Keys is + // undefined. Keys may be empty, even if Start and End are non-nil. + // + // Keys are a decoded representation of the internal keys stored in batches + // or sstable blocks. A single internal key in a range key block may produce + // several decoded Keys. + Keys []Key + KeysOrder KeysOrder +} + +// KeysOrder describes the ordering of Keys within a Span. +type KeysOrder int8 + +const ( + // ByTrailerDesc indicates a Span's keys are sorted by Trailer descending. + // This is the default ordering, and the ordering used during physical + // storage. 
+ ByTrailerDesc KeysOrder = iota + // BySuffixAsc indicates a Span's keys are sorted by Suffix ascending. This + // ordering is used during user iteration of range keys. + BySuffixAsc +) + +// Key represents a single key applied over a span of user keys. A Key is +// contained by a Span which specifies the span of user keys over which the Key +// is applied. +type Key struct { + // Trailer contains the key kind and sequence number. + Trailer uint64 + // Suffix holds an optional suffix associated with the key. This is only + // non-nil for RANGEKEYSET and RANGEKEYUNSET keys. + Suffix []byte + // Value holds a logical value associated with the Key. It is NOT the + // internal value stored in a range key or range deletion block. This is + // only non-nil for RANGEKEYSET keys. + Value []byte +} + +// SeqNum returns the sequence number component of the key. +func (k Key) SeqNum() uint64 { + return k.Trailer >> 8 +} + +// VisibleAt returns true if the provided key is visible at the provided +// snapshot sequence number. It interprets batch sequence numbers as always +// visible, because non-visible batch span keys are filtered when they're +// fragmented. +func (k Key) VisibleAt(snapshot uint64) bool { + seq := k.SeqNum() + return seq < snapshot || seq&base.InternalKeySeqNumBatch != 0 +} + +// Kind returns the kind component of the key. +func (k Key) Kind() base.InternalKeyKind { + return base.InternalKeyKind(k.Trailer & 0xff) +} + +// Equal returns true if this Key is equal to the given key. Two keys are said +// to be equal if the two Keys have equal trailers, suffix and value. Suffix +// comparison uses the provided base.Compare func. Value comparison is bytewise. +func (k Key) Equal(equal base.Equal, b Key) bool { + return k.Trailer == b.Trailer && + equal(k.Suffix, b.Suffix) && + bytes.Equal(k.Value, b.Value) +} + +// Valid returns true if the span is defined. 
+func (s *Span) Valid() bool { + return s.Start != nil && s.End != nil +} + +// Empty returns true if the span does not contain any keys. An empty span may +// still be Valid. A non-empty span must be Valid. +// +// An Empty span may be produced by Visible, or be produced by iterators in +// order to surface the gaps between keys. +func (s *Span) Empty() bool { + return s == nil || len(s.Keys) == 0 +} + +// SmallestKey returns the smallest internal key defined by the span's keys. +// It requires the Span's keys be in ByTrailerDesc order. It panics if the span +// contains no keys or its keys are sorted in a different order. +func (s *Span) SmallestKey() base.InternalKey { + if len(s.Keys) == 0 { + panic("pebble: Span contains no keys") + } else if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + // The first key has the highest (sequence number,kind) tuple. + return base.InternalKey{ + UserKey: s.Start, + Trailer: s.Keys[0].Trailer, + } +} + +// LargestKey returns the largest internal key defined by the span's keys. The +// returned key will always be a "sentinel key" at the end boundary. The +// "sentinel key" models the exclusive end boundary by returning an InternalKey +// with the maximal sequence number, ensuring all InternalKeys with the same +// user key sort after the sentinel key. +// +// It requires the Span's keys be in ByTrailerDesc order. It panics if the span +// contains no keys or its keys are sorted in a different order. +func (s *Span) LargestKey() base.InternalKey { + if len(s.Keys) == 0 { + panic("pebble: Span contains no keys") + } else if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + // The last key has the lowest (sequence number,kind) tuple. + kind := s.Keys[len(s.Keys)-1].Kind() + return base.MakeExclusiveSentinelKey(kind, s.End) +} + +// SmallestSeqNum returns the smallest sequence number of a key contained within +// the span. 
It requires the Span's keys be in ByTrailerDesc order. It panics if +// the span contains no keys or its keys are sorted in a different order. +func (s *Span) SmallestSeqNum() uint64 { + if len(s.Keys) == 0 { + panic("pebble: Span contains no keys") + } else if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + + return s.Keys[len(s.Keys)-1].SeqNum() +} + +// LargestSeqNum returns the largest sequence number of a key contained within +// the span. It requires the Span's keys be in ByTrailerDesc order. It panics if +// the span contains no keys or its keys are sorted in a different order. +func (s *Span) LargestSeqNum() uint64 { + if len(s.Keys) == 0 { + panic("pebble: Span contains no keys") + } else if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + return s.Keys[0].SeqNum() +} + +// TODO(jackson): Replace most of the calls to Visible with more targeted calls +// that avoid the need to construct a new Span. + +// Visible returns a span with the subset of keys visible at the provided +// sequence number. It requires the Span's keys be in ByTrailerDesc order. It +// panics if the span's keys are sorted in a different order. +// +// Visible may incur an allocation, so callers should prefer targeted, +// non-allocating methods when possible. +func (s Span) Visible(snapshot uint64) Span { + if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + + ret := Span{Start: s.Start, End: s.End} + if len(s.Keys) == 0 { + return ret + } + + // Keys from indexed batches may force an allocation. The Keys slice is + // ordered by sequence number, so ordinarily we can return the trailing + // subslice containing keys with sequence numbers less than `seqNum`. + // + // However, batch keys are special. Only visible batch keys are included + // when an Iterator's batch spans are fragmented. They must always be + // visible. 
+ // + // Batch keys can create a sandwich of visible batch keys at the beginning + // of the slice and visible committed keys at the end of the slice, forcing + // us to allocate a new slice and copy the contents. + // + // Care is taking to only incur an allocation only when batch keys and + // visible keys actually sandwich non-visible keys. + + // lastBatchIdx and lastNonVisibleIdx are set to the last index of a batch + // key and a non-visible key respectively. + lastBatchIdx := -1 + lastNonVisibleIdx := -1 + for i := range s.Keys { + if seqNum := s.Keys[i].SeqNum(); seqNum&base.InternalKeySeqNumBatch != 0 { + // Batch key. Always visible. + lastBatchIdx = i + } else if seqNum >= snapshot { + // This key is not visible. + lastNonVisibleIdx = i + } + } + + // In the following comments: b = batch, h = hidden, v = visible (committed). + switch { + case lastNonVisibleIdx == -1: + // All keys are visible. + // + // [b b b], [v v v] and [b b b v v v] + ret.Keys = s.Keys + case lastBatchIdx == -1: + // There are no batch keys, so we can return the continuous subslice + // starting after the last non-visible Key. + // + // h h h [v v v] + ret.Keys = s.Keys[lastNonVisibleIdx+1:] + case lastNonVisibleIdx == len(s.Keys)-1: + // While we have a batch key and non-visible keys, there are no + // committed visible keys. The 'sandwich' is missing the bottom layer, + // so we can return the continuous sublice at the beginning. + // + // [b b b] h h h + ret.Keys = s.Keys[0 : lastBatchIdx+1] + default: + // This is the problematic sandwich case. Allocate a new slice, copying + // the batch keys and the visible keys into it. + // + // [b b b] h h h [v v v] + ret.Keys = make([]Key, (lastBatchIdx+1)+(len(s.Keys)-lastNonVisibleIdx-1)) + copy(ret.Keys, s.Keys[:lastBatchIdx+1]) + copy(ret.Keys[lastBatchIdx+1:], s.Keys[lastNonVisibleIdx+1:]) + } + return ret +} + +// VisibleAt returns true if the span contains a key visible at the provided +// snapshot. 
Keys with sequence numbers with the batch bit set are treated as +// always visible. +// +// VisibleAt requires the Span's keys be in ByTrailerDesc order. It panics if +// the span's keys are sorted in a different order. +func (s *Span) VisibleAt(snapshot uint64) bool { + if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + if len(s.Keys) == 0 { + return false + } else if first := s.Keys[0].SeqNum(); first&base.InternalKeySeqNumBatch != 0 { + // Only visible batch keys are included when an Iterator's batch spans + // are fragmented. They must always be visible. + return true + } else { + // Otherwise we check the last key. Since keys are ordered decreasing in + // sequence number, the last key has the lowest sequence number of any + // of the span's keys. If any of the keys are visible, the last key must + // be visible. Or put differently: if the last key is not visible, then + // no key is visible. + return s.Keys[len(s.Keys)-1].SeqNum() < snapshot + } +} + +// ShallowClone returns the span with a Keys slice owned by the span itself. +// None of the key byte slices are cloned (see Span.DeepClone). +func (s *Span) ShallowClone() Span { + c := Span{ + Start: s.Start, + End: s.End, + Keys: make([]Key, len(s.Keys)), + KeysOrder: s.KeysOrder, + } + copy(c.Keys, s.Keys) + return c +} + +// DeepClone clones the span, creating copies of all contained slices. DeepClone +// is intended for non-production code paths like tests, the level checker, etc +// because it is allocation heavy. 
+func (s *Span) DeepClone() Span { + c := Span{ + Start: make([]byte, len(s.Start)), + End: make([]byte, len(s.End)), + Keys: make([]Key, len(s.Keys)), + KeysOrder: s.KeysOrder, + } + copy(c.Start, s.Start) + copy(c.End, s.End) + for i := range s.Keys { + c.Keys[i].Trailer = s.Keys[i].Trailer + if len(s.Keys[i].Suffix) > 0 { + c.Keys[i].Suffix = make([]byte, len(s.Keys[i].Suffix)) + copy(c.Keys[i].Suffix, s.Keys[i].Suffix) + } + if len(s.Keys[i].Value) > 0 { + c.Keys[i].Value = make([]byte, len(s.Keys[i].Value)) + copy(c.Keys[i].Value, s.Keys[i].Value) + } + } + return c +} + +// Contains returns true if the specified key resides within the span's bounds. +func (s *Span) Contains(cmp base.Compare, key []byte) bool { + return cmp(s.Start, key) <= 0 && cmp(key, s.End) < 0 +} + +// Covers returns true if the span covers keys at seqNum. +// +// Covers requires the Span's keys be in ByTrailerDesc order. It panics if the +// span's keys are sorted in a different order. +func (s Span) Covers(seqNum uint64) bool { + if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + return !s.Empty() && s.Keys[0].SeqNum() > seqNum +} + +// CoversAt returns true if the span contains a key that is visible at the +// provided snapshot sequence number, and that key's sequence number is higher +// than seqNum. +// +// Keys with sequence numbers with the batch bit set are treated as always +// visible. +// +// CoversAt requires the Span's keys be in ByTrailerDesc order. It panics if the +// span's keys are sorted in a different order. +func (s *Span) CoversAt(snapshot, seqNum uint64) bool { + if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + // NB: A key is visible at `snapshot` if its sequence number is strictly + // less than `snapshot`. See base.Visible. 
+ for i := range s.Keys { + if kseq := s.Keys[i].SeqNum(); kseq&base.InternalKeySeqNumBatch != 0 { + // Only visible batch keys are included when an Iterator's batch spans + // are fragmented. They must always be visible. + return kseq > seqNum + } else if kseq < snapshot { + return kseq > seqNum + } + } + return false +} + +// String returns a string representation of the span. +func (s Span) String() string { + return fmt.Sprint(prettySpan{Span: s, formatKey: base.DefaultFormatter}) +} + +// Pretty returns a formatter for the span. +func (s Span) Pretty(f base.FormatKey) fmt.Formatter { + // TODO(jackson): Take a base.FormatValue to format Key.Value too. + return prettySpan{s, f} +} + +type prettySpan struct { + Span + formatKey base.FormatKey +} + +func (s prettySpan) Format(fs fmt.State, c rune) { + if !s.Valid() { + fmt.Fprintf(fs, "") + return + } + fmt.Fprintf(fs, "%s-%s:{", s.formatKey(s.Start), s.formatKey(s.End)) + for i, k := range s.Keys { + if i > 0 { + fmt.Fprint(fs, " ") + } + fmt.Fprintf(fs, "(#%d,%s", k.SeqNum(), k.Kind()) + if len(k.Suffix) > 0 || len(k.Value) > 0 { + fmt.Fprintf(fs, ",%s", k.Suffix) + } + if len(k.Value) > 0 { + fmt.Fprintf(fs, ",%s", k.Value) + } + fmt.Fprint(fs, ")") + } + fmt.Fprintf(fs, "}") +} + +// SortKeysByTrailer sorts a keys slice by trailer. +func SortKeysByTrailer(keys *[]Key) { + // NB: keys is a pointer to a slice instead of a slice to avoid `sorted` + // escaping to the heap. + sorted := (*keysBySeqNumKind)(keys) + sort.Sort(sorted) +} + +// KeysBySuffix implements sort.Interface, sorting its member Keys slice to by +// Suffix in the order dictated by Cmp. 
+type KeysBySuffix struct { + Cmp base.Compare + Keys []Key +} + +func (s *KeysBySuffix) Len() int { return len(s.Keys) } +func (s *KeysBySuffix) Less(i, j int) bool { return s.Cmp(s.Keys[i].Suffix, s.Keys[j].Suffix) < 0 } +func (s *KeysBySuffix) Swap(i, j int) { s.Keys[i], s.Keys[j] = s.Keys[j], s.Keys[i] } + +// ParseSpan parses the string representation of a Span. It's intended for +// tests. ParseSpan panics if passed a malformed span representation. +func ParseSpan(input string) Span { + var s Span + parts := strings.FieldsFunc(input, func(r rune) bool { + switch r { + case '-', ':', '{', '}': + return true + default: + return unicode.IsSpace(r) + } + }) + s.Start, s.End = []byte(parts[0]), []byte(parts[1]) + + // Each of the remaining parts represents a single Key. + s.Keys = make([]Key, 0, len(parts)-2) + for _, p := range parts[2:] { + keyFields := strings.FieldsFunc(p, func(r rune) bool { + switch r { + case '#', ',', '(', ')': + return true + default: + return unicode.IsSpace(r) + } + }) + + var k Key + // Parse the sequence number. + seqNum, err := strconv.ParseUint(keyFields[0], 10, 64) + if err != nil { + panic(fmt.Sprintf("invalid sequence number: %q: %s", keyFields[0], err)) + } + // Parse the key kind. + kind := base.ParseKind(keyFields[1]) + k.Trailer = base.MakeTrailer(seqNum, kind) + // Parse the optional suffix. + if len(keyFields) >= 3 { + k.Suffix = []byte(keyFields[2]) + } + // Parse the optional value. + if len(keyFields) >= 4 { + k.Value = []byte(keyFields[3]) + } + s.Keys = append(s.Keys, k) + } + return s +} diff --git a/pebble/internal/keyspan/span_test.go b/pebble/internal/keyspan/span_test.go new file mode 100644 index 0000000..29651fb --- /dev/null +++ b/pebble/internal/keyspan/span_test.go @@ -0,0 +1,98 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "bytes" + "fmt" + "strconv" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/stretchr/testify/require" +) + +// TODO(jackson): Add unit tests for all of the various Span methods. + +func TestSpan_ParseRoundtrip(t *testing.T) { + spans := []string{ + "a-c:{(#5,RANGEDEL)}", + "a-c:{(#5,RANGEDEL) (#2,RANGEDEL)}", + "h-z:{(#20,RANGEKEYSET,@5,foo) (#15,RANGEKEYUNSET,@9) (#2,RANGEKEYDEL)}", + } + for _, input := range spans { + got := ParseSpan(input).String() + if got != input { + t.Errorf("ParseSpan(%q).String() = %q", input, got) + } + } +} + +func TestSpan_Visible(t *testing.T) { + var s Span + datadriven.RunTest(t, "testdata/visible", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + s = ParseSpan(d.Input) + return fmt.Sprint(s) + case "visible": + var buf bytes.Buffer + for _, line := range strings.Split(d.Input, "\n") { + snapshot, err := strconv.ParseUint(line, 10, 64) + require.NoError(t, err) + fmt.Fprintf(&buf, "%-2d: %s\n", snapshot, s.Visible(snapshot)) + } + return buf.String() + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestSpan_VisibleAt(t *testing.T) { + var s Span + datadriven.RunTest(t, "testdata/visible_at", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + s = ParseSpan(d.Input) + return fmt.Sprint(s) + case "visible-at": + var buf bytes.Buffer + for _, line := range strings.Split(d.Input, "\n") { + snapshot, err := strconv.ParseUint(line, 10, 64) + require.NoError(t, err) + fmt.Fprintf(&buf, "%-2d: %t\n", snapshot, s.VisibleAt(snapshot)) + } + return buf.String() + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestSpan_CoversAt(t *testing.T) { + var s Span + datadriven.RunTest(t, "testdata/covers_at", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + s = ParseSpan(d.Input) + return fmt.Sprint(s) + case 
"covers-at": + var buf bytes.Buffer + for _, line := range strings.Split(d.Input, "\n") { + fields := strings.Fields(line) + snapshot, err := strconv.ParseUint(fields[0], 10, 64) + require.NoError(t, err) + seqNum, err := strconv.ParseUint(fields[1], 10, 64) + require.NoError(t, err) + fmt.Fprintf(&buf, "%d %d : %t\n", snapshot, seqNum, s.CoversAt(snapshot, seqNum)) + } + return buf.String() + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} diff --git a/pebble/internal/keyspan/testdata/bounded_iter b/pebble/internal/keyspan/testdata/bounded_iter new file mode 100644 index 0000000..8532f62 --- /dev/null +++ b/pebble/internal/keyspan/testdata/bounded_iter @@ -0,0 +1,251 @@ +define +a-b:{(#10,RANGEKEYSET,@5,apples)} +d-e:{(#4,RANGEKEYSET,@3,coconut)} +g-h:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +---- + +# Nothing out of bounds. + +iter lower=a upper=z +first +next +next +next +last +prev +prev +prev +---- +a-b:{(#10,RANGEKEYSET,@5,apples)} +d-e:{(#4,RANGEKEYSET,@3,coconut)} +g-h:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} + +g-h:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +d-e:{(#4,RANGEKEYSET,@3,coconut)} +a-b:{(#10,RANGEKEYSET,@5,apples)} + + +# Test out of upper bound, but undiscovered until we Next. + +iter lower=a upper=f +first +next +next +prev +---- +a-b:{(#10,RANGEKEYSET,@5,apples)} +d-e:{(#4,RANGEKEYSET,@3,coconut)} + +d-e:{(#4,RANGEKEYSET,@3,coconut)} + +# Test out of upper bound, but discovered before we Next. + +iter lower=a upper=dog +first +next +next +prev +---- +a-b:{(#10,RANGEKEYSET,@5,apples)} +d-e:{(#4,RANGEKEYSET,@3,coconut)} + +d-e:{(#4,RANGEKEYSET,@3,coconut)} + +# Test out of lower bound, but undiscovered until we Prev. + +iter lower=c upper=z +last +prev +prev +next +---- +g-h:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +d-e:{(#4,RANGEKEYSET,@3,coconut)} + +d-e:{(#4,RANGEKEYSET,@3,coconut)} + +# Test out of lower bound, but discovered before we Prev. 
+ +iter lower=d upper=z +last +prev +prev +next +---- +g-h:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +d-e:{(#4,RANGEKEYSET,@3,coconut)} + +d-e:{(#4,RANGEKEYSET,@3,coconut)} + +# Test a single span ([b-g)) within the bounds, overlapping on both ends. + +define +a-b:{(#10,RANGEKEYSET,@5)} +b-g:{(#4,RANGEKEYSET,@3)} +g-h:{(#20,RANGEKEYSET,@5)} +---- + +iter lower=c upper=f +seek-ge b +next +next +seek-ge b +prev +prev +seek-lt f +prev +prev +seek-lt f +next +next +prev +prev +---- +b-g:{(#4,RANGEKEYSET,@3)} + + +b-g:{(#4,RANGEKEYSET,@3)} + + +b-g:{(#4,RANGEKEYSET,@3)} + + +b-g:{(#4,RANGEKEYSET,@3)} + + +b-g:{(#4,RANGEKEYSET,@3)} + + +set-prefix bar +---- +set prefix to "bar" + +# Test seeking to a portion of the keyspace that contains no range keys with +# start bounds ≥ the seek key such that the range key also overlaps the current +# prefix. + +iter lower=a upper=z +seek-ge bar +prev +prev +---- +b-g:{(#4,RANGEKEYSET,@3)} + + + +# Test seeking to a portion of the keyspace that contains a range key with a +# start bound < the seek key, and the range key also overlaps the current +# prefix. + +iter lower=a upper=z +seek-lt bar +next +prev +prev +---- +b-g:{(#4,RANGEKEYSET,@3)} + +b-g:{(#4,RANGEKEYSET,@3)} + + +# Test seeking with bounds narrower than the range of the seek prefix. This is +# possible in practice because the bounded iterator iterates over fragments, not +# pre-defragmented range keys. + +iter lower=bar@9 upper=bar@3 +seek-lt bar +next +prev +prev +---- +b-g:{(#4,RANGEKEYSET,@3)} + +b-g:{(#4,RANGEKEYSET,@3)} + + +# Test a similar scenario but on the start prefix of a key. + +iter lower=b@9 upper=b@3 +seek-lt b +next +next +prev +prev +---- + +b-g:{(#4,RANGEKEYSET,@3)} + +b-g:{(#4,RANGEKEYSET,@3)} + + +# Test a scenario where the prefix overlaps a span, but the bounds exclude it. + +iter lower=z@9 upper=z@3 +seek-lt z@3 +next +---- + + + +# Test many spans matching the prefix, due to fragmentation within a prefix. 
+ +define +b-boo:{(#1,RANGEKEYSET,@1)} +c@9-c@8:{(#1,RANGEKEYSET,@1)} +c@8-c@7:{(#1,RANGEKEYSET,@1)} +c@7-c@6:{(#1,RANGEKEYSET,@1)} +c@6-c@5:{(#1,RANGEKEYSET,@1)} +c@5-c@4:{(#1,RANGEKEYSET,@1)} +---- + +set-prefix c +---- +set prefix to "c" + +iter +seek-lt c +next +next +next +next +next +next +---- + +c@9-c@8:{(#1,RANGEKEYSET,@1)} +c@8-c@7:{(#1,RANGEKEYSET,@1)} +c@7-c@6:{(#1,RANGEKEYSET,@1)} +c@6-c@5:{(#1,RANGEKEYSET,@1)} +c@5-c@4:{(#1,RANGEKEYSET,@1)} + + +# Test the same scenario with bounds limiting iteration to a subset of the +# keys. + +iter lower=c@7 upper=c@5 +seek-lt c@7 +next +next +next +---- + +c@7-c@6:{(#1,RANGEKEYSET,@1)} +c@6-c@5:{(#1,RANGEKEYSET,@1)} + + +define +a@7-a@5:{(#1,RANGEKEYSET,@1)} +b-boo:{(#1,RANGEKEYSET,@1)} +c@9-c@8:{(#1,RANGEKEYSET,@1)} +---- + +set-prefix b +---- +set prefix to "b" + +iter +seek-lt c@8 +seek-ge a@9 +---- + + diff --git a/pebble/internal/keyspan/testdata/covers_at b/pebble/internal/keyspan/testdata/covers_at new file mode 100644 index 0000000..c32f7ed --- /dev/null +++ b/pebble/internal/keyspan/testdata/covers_at @@ -0,0 +1,91 @@ +define +a-b:{(#5,RANGEDEL) (#3,RANGEDEL)} +---- +a-b:{(#5,RANGEDEL) (#3,RANGEDEL)} + +covers-at +6 6 +6 5 +6 4 +6 2 +6 3 +5 5 +5 4 +5 3 +5 2 +4 5 +4 1 +3 9 +3 2 +3 1 +3 0 +2 0 +1 0 +---- +6 6 : false +6 5 : false +6 4 : true +6 2 : true +6 3 : true +5 5 : false +5 4 : false +5 3 : false +5 2 : true +4 5 : false +4 1 : true +3 9 : false +3 2 : false +3 1 : false +3 0 : false +2 0 : false +1 0 : false + +# The below sequence number is the minimal batch sequence number (eg, a RANGEDEL +# written right at the beginning of the batch.) In the tests below, all other +# batch sequence numbers are not covered by it. 
+ +define +a-c:{(#36028797018963968,RANGEDEL)} +---- +a-c:{(#36028797018963968,RANGEDEL)} + +covers-at +100 90000 +100 90 +0 0 +33 36028797018964068 +33 36028797018963968 +---- +100 90000 : true +100 90 : true +0 0 : true +33 36028797018964068 : false +33 36028797018963968 : false + +# The below sequence number is a batch sequence number for offset 100. + +define +a-c:{(#36028797018964068,RANGEDEL)} +---- +a-c:{(#36028797018964068,RANGEDEL)} + +covers-at +10 10 +---- +10 10 : true + +# The below sequence number is a batch sequence number for offset 200. It should +# not be covered. + +covers-at +100 36028797018964168 +---- +100 36028797018964168 : false + +# The below sequence number is a batch sequence number for offset 0. It should +# be covered. + +covers-at +100 36028797018963968 +---- +100 36028797018963968 : true diff --git a/pebble/internal/keyspan/testdata/defragmenting_iter b/pebble/internal/keyspan/testdata/defragmenting_iter new file mode 100644 index 0000000..f81ca9b --- /dev/null +++ b/pebble/internal/keyspan/testdata/defragmenting_iter @@ -0,0 +1,395 @@ +# Test a scenario that should NOT result in defragmentation. 
+ +define +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +---- + +iter +first +next +next +last +prev +prev +---- +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} + +iter +first +next +next +next +last +prev +prev +prev +---- +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} + +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} + + +# Test a scenario that SHOULD result in internal defragmentation ([a,c) and +# [c,d) should be merged. + +define +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +d-e:{(#1,RANGEKEYSET,@3,bananas)} +---- + +iter +first +next +next +---- +a-d:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +d-e:{(#1,RANGEKEYSET,@3,bananas)} + + +# Test defragmenting in both directions at seek keys. 
+ +define +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +f-h:{(#3,RANGEKEYSET,@3,bananas)} +h-p:{(#3,RANGEKEYSET,@3,bananas)} +p-t:{(#3,RANGEKEYSET,@3,bananas)} +---- + +iter +seekge b +prev +seekge b +next +seeklt d +next +seeklt d +prev +---- +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} + +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +f-t:{(#3,RANGEKEYSET,@3,bananas)} +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +f-t:{(#3,RANGEKEYSET,@3,bananas)} +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} + + +iter +seeklt d +next +prev +---- +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +f-t:{(#3,RANGEKEYSET,@3,bananas)} +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} + +# Test next-ing and prev-ing around seek keys. + +define +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +f-h:{(#3,RANGEKEYSET,@3,bananas)} +h-p:{(#3,RANGEKEYSET,@3,bananas)} +p-t:{(#3,RANGEKEYSET,@3,bananas)} +t-z:{(#4,RANGEKEYSET,@2,oranges)} +---- + +iter +seekge r +prev +next +next +---- +f-t:{(#3,RANGEKEYSET,@3,bananas)} +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +f-t:{(#3,RANGEKEYSET,@3,bananas)} +t-z:{(#4,RANGEKEYSET,@2,oranges)} + +iter +seekge f +seekge h +seekge p +seekge t +seekge u +seekge v +seekge z +---- +f-t:{(#3,RANGEKEYSET,@3,bananas)} +f-t:{(#3,RANGEKEYSET,@3,bananas)} +f-t:{(#3,RANGEKEYSET,@3,bananas)} +t-z:{(#4,RANGEKEYSET,@2,oranges)} +t-z:{(#4,RANGEKEYSET,@2,oranges)} +t-z:{(#4,RANGEKEYSET,@2,oranges)} + + +iter +seeklt f +seeklt h +seeklt p +seeklt t +seeklt u +seeklt z +---- +a-f:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +f-t:{(#3,RANGEKEYSET,@3,bananas)} +f-t:{(#3,RANGEKEYSET,@3,bananas)} 
+f-t:{(#3,RANGEKEYSET,@3,bananas)} +t-z:{(#4,RANGEKEYSET,@2,oranges)} +t-z:{(#4,RANGEKEYSET,@2,oranges)} + +# Test iteration with a reducer that collects keys across all spans that +# constitute a defragmented span. Abutting spans are always combined. + +define +a-b:{(#3,RANGEDEL) (#2,RANGEDEL)} +b-c:{(#4,RANGEDEL) (#1,RANGEDEL)} +c-d:{(#5,RANGEDEL)} +e-f:{(#1,RANGEDEL)} +f-g:{(#2,RANGEDEL)} +---- + +iter equal=always reducer=collect +first +next +next +last +prev +prev +---- +a-d:{(#5,RANGEDEL) (#4,RANGEDEL) (#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +e-g:{(#2,RANGEDEL) (#1,RANGEDEL)} + +e-g:{(#2,RANGEDEL) (#1,RANGEDEL)} +a-d:{(#5,RANGEDEL) (#4,RANGEDEL) (#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} + + +# Test defragmentation of non-empty (i.e. more than one value) fragments, while +# empty fragments are left untouched. + +define +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +d-e:{} +e-f:{} +g-h:{(#1,RANGEKEYSET,@3,bananas)} +---- + +iter +first +next +next +next +next +---- +a-d:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +d-e:{} +e-f:{} +g-h:{(#1,RANGEKEYSET,@3,bananas)} + + +iter +last +prev +prev +prev +prev +---- +g-h:{(#1,RANGEKEYSET,@3,bananas)} +e-f:{} +d-e:{} +a-d:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} + + +iter +seekge d +next +prev +seekge e +next +prev +prev +prev +---- +d-e:{} +e-f:{} +d-e:{} +e-f:{} +g-h:{(#1,RANGEKEYSET,@3,bananas)} +e-f:{} +d-e:{} +a-d:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} + +iter +seeklt e +next +prev +seeklt f +next +prev +prev +prev +---- +d-e:{} +e-f:{} +d-e:{} +e-f:{} +g-h:{(#1,RANGEKEYSET,@3,bananas)} +e-f:{} +d-e:{} +a-d:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} + +# Test that the defragmenting iterator does yield errors in cases that do not +# need to 
defragment. + +define +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +---- + +iter probes=(ErrInjected) +seek-ge b +seek-lt d +first +last +---- + err= + err= + err= + err= + +# Next and Prev may only be called on positioned iterators, so to test +# propagation of errors on Next or Prev, we must use a probe that injects errors +# on Next or Prev but leaves seeks untouched. +# +# The situation is complicated by the fact that a seek on the defragmenting +# iterator will result in Next/Prevs on the embedded iterator (in order to peek +# ahead to see if anything needs to be defragmented). +# +# First we test the seeks too result in injected errors when they Next/Prev +# ahead to determine if there's anything to defragment. + +iter probes=((If (Or OpNext OpPrev) ErrInjected noop), (Log "# inner.")) +seek-ge b +next +seek-lt cat +prev +---- +# inner.SeekGE("b") = a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +# inner.Prev() = nil + err= +# inner.Next() = nil + err= +# inner.SeekLT("cat") = c-d:{(#4,RANGEKEYSET,@3,bananas)} +# inner.Next() = nil + err= +# inner.Prev() = nil + err= + +# Use a probe that injects errors whenever we otherwise would've returned the +# c-d span. First and Last calls should both return errors because during +# defragmenting they'll step the internal iterator on to the error position. + +iter probes=((If (Equal StartKey (Bytes "c")) ErrInjected noop), (Log "# inner.")) +first +last +---- +# inner.First() = a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +# inner.Next() = nil + err= +# inner.Last() = d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +# inner.Prev() = nil + err= + +# In order to test that errors are injected when Next-ing the top-level +# iterator, define test data that includes 5 spans. 
+ +define +a-b:{(#3,RANGEKEYUNSET,@5)} +b-c:{(#4,RANGEKEYSET,@5,apples)} +c-d:{(#5,RANGEKEYSET,@3,bananas)} +d-e:{(#6,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +e-f:{(#4,RANGEKEYSET,@1,pineapple)} +---- + +# Use a probe that injects errors whenever we would've otherwise returned the +# c-d span. Our initial First/Last seeks should not step on to the error +# position and should not error. The subsequent Next/Prev however should. + +iter probes=((If (Equal StartKey (Bytes "c")) ErrInjected noop), (Log "# inner.")) +first +next +last +prev +---- +# inner.First() = a-b:{(#3,RANGEKEYUNSET,@5)} +# inner.Next() = b-c:{(#4,RANGEKEYSET,@5,apples)} +a-b:{(#3,RANGEKEYUNSET,@5)} +# inner.Next() = nil + err= +# inner.Last() = e-f:{(#4,RANGEKEYSET,@1,pineapple)} +# inner.Prev() = d-e:{(#6,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +e-f:{(#4,RANGEKEYSET,@1,pineapple)} +# inner.Prev() = nil + err= + +# When seeking, the defragmenting iterator needs to defragment in both +# directions. A forward seek first defragments in the reverse direction, and +# then in the forward direction. A backward seek does the inverse. If an error +# is encountered while performing the first defragment scan, it must be +# surfaced. +# +# To test this scenario we again inject errors instead of the c-d span. +# - The SeekGE('d') should land on d-e, try to defragment backward first and +# encounter the error. +# - The SeekLT('c') should land on b-c, try to defragment forward first and +# encounter the error. 
+iter probes=((If (Equal StartKey (Bytes "c")) ErrInjected noop), (Log "# inner.")) +seek-ge d +seek-lt c +---- +# inner.SeekGE("d") = d-e:{(#6,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +# inner.Prev() = nil + err= +# inner.SeekLT("c") = b-c:{(#4,RANGEKEYSET,@5,apples)} +# inner.Next() = nil + err= + +# When changing directions in some circumstances we step an iterator and then +# defragment twice; once to skip over the current span and once to construct the +# next defragmented span in the new iteration direction. If the first step of +# the iterator surfaces an error, ensure that it's still propagated. +iter probes=((If (And OpPrev (Equal StartKey (Bytes "c"))) ErrInjected noop), (Log "# inner.")) +seek-ge c +prev +---- +# inner.SeekGE("c") = c-d:{(#5,RANGEKEYSET,@3,bananas)} +# inner.Prev() = b-c:{(#4,RANGEKEYSET,@5,apples)} +# inner.Next() = c-d:{(#5,RANGEKEYSET,@3,bananas)} +# inner.Next() = d-e:{(#6,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +c-d:{(#5,RANGEKEYSET,@3,bananas)} +# inner.Prev() = nil + err= + +iter probes=((If (And OpNext (Equal StartKey (Bytes "c"))) ErrInjected noop), (Log "# inner.")) +seek-lt d +next +---- +# inner.SeekLT("d") = c-d:{(#5,RANGEKEYSET,@3,bananas)} +# inner.Next() = d-e:{(#6,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +# inner.Prev() = c-d:{(#5,RANGEKEYSET,@3,bananas)} +# inner.Prev() = b-c:{(#4,RANGEKEYSET,@5,apples)} +c-d:{(#5,RANGEKEYSET,@3,bananas)} +# inner.Next() = nil + err= diff --git a/pebble/internal/keyspan/testdata/filtering_iter b/pebble/internal/keyspan/testdata/filtering_iter new file mode 100644 index 0000000..3299254 --- /dev/null +++ b/pebble/internal/keyspan/testdata/filtering_iter @@ -0,0 +1,84 @@ +# The following filters are available: +# - no-op: passes through all spans. +# - key-kind-{set,unset,del}: filters keys in spans with the given key kind. 
+ +define +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas) (#3,RANGEKEYDEL)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +---- + +iter filter=no-op +first +next +next +next +---- +a-c:{(#3,RANGEKEYUNSET,@5) (#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas) (#3,RANGEKEYDEL)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +. + +iter filter=key-kind-set +first +next +next +next +---- +a-c:{(#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +. + +iter filter=key-kind-set +last +prev +prev +prev +---- +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +a-c:{(#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +. + +iter filter=key-kind-set +seek-ge a +seek-ge c +next +seek-lt b +prev +next +seek-lt z +next +---- +a-c:{(#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +a-c:{(#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +. +a-c:{(#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} +. + +iter filter=key-kind-set +first +next +next +---- +a-c:{(#2,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,bananas)} +d-e:{(#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@1,pineapple)} + +iter filter=key-kind-unset +first +next +---- +a-c:{(#3,RANGEKEYUNSET,@5)} +. + +iter filter=key-kind-del +first +next +---- +c-d:{(#3,RANGEKEYDEL)} +. 
diff --git a/pebble/internal/keyspan/testdata/fragmenter b/pebble/internal/keyspan/testdata/fragmenter new file mode 100644 index 0000000..a064b00 --- /dev/null +++ b/pebble/internal/keyspan/testdata/fragmenter @@ -0,0 +1,951 @@ +build +3: a-----------m +2: f------------s +1: j---------------z +---- +3: a----f +3: f---j +2: f---j +3: j--m +2: j--m +1: j--m +2: m-----s +1: m-----s +1: s------z + +# Building is idempotent. +build +3: a----f +3: f---j +2: f---j +3: j--m +2: j--m +1: j--m +2: m-----s +1: m-----s +1: s------z +---- +3: a----f +3: f---j +2: f---j +3: j--m +2: j--m +1: j--m +2: m-----s +1: m-----s +1: s------z + +# An empty tombstone will not get emitted. +build +1: a-a +---- + +build +2: c-e +1: a-c +---- +pebble: keys must be added in order: c > a + +build +3: a-a +3: a-b +2: a-b +1: a-a +---- +3: ab +2: ab + +build +1: a---e +3: b-d +---- +1: ab +3: b-d +1: b-d +1: de + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive alive alive deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive deleted + +get t=2 +a#1 a#0 +---- +alive deleted + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive alive alive deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive deleted + +get t=2 +d#1 d#0 +---- +alive deleted + + +build +3: a---e +1: b-d +---- +3: ab +3: b-d +1: b-d +3: de + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive deleted deleted deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive alive + +get t=2 +a#1 a#0 +---- +alive alive + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive 
deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive deleted deleted deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive alive + +get t=2 +d#1 d#0 +---- +alive alive + + +build +3: a--d +1: b--e +---- +3: ab +3: b-d +1: b-d +1: de + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive deleted deleted deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive alive + +get t=2 +a#1 a#0 +---- +alive alive + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive alive alive deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive deleted + +get t=2 +d#1 d#0 +---- +alive deleted + + +build +1: a--d +3: b--e +---- +1: ab +3: b-d +1: b-d +3: de + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive alive alive deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive deleted + +get t=2 +a#1 a#0 +---- +alive deleted + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive deleted deleted deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive alive + +get t=2 +d#1 d#0 +---- +alive alive + + +build +3: a--d +1: a---e +---- +3: a--d +1: a--d +1: de + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive deleted deleted deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive deleted + +get t=2 +a#1 a#0 +---- +alive deleted + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted 
deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive alive alive deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive deleted + +get t=2 +d#1 d#0 +---- +alive deleted + + +build +3: a---e +1: a--d +---- +3: a--d +1: a--d +3: de + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive deleted deleted deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive deleted + +get t=2 +a#1 a#0 +---- +alive deleted + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive deleted deleted deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive alive + +get t=2 +d#1 d#0 +---- +alive alive + + +build +1: a---e +3: b--e +---- +1: ab +3: b--e +1: b--e + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive alive alive deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive deleted + +get t=2 +a#1 a#0 +---- +alive deleted + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive deleted deleted deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive deleted + +get t=2 +d#1 d#0 +---- +alive deleted + + +build +3: a---e +1: b--e +---- +3: ab +3: b--e +1: b--e + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive deleted deleted deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive alive + +get t=2 +a#1 a#0 +---- +alive alive + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive 
deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive deleted deleted deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive deleted + +get t=2 +d#1 d#0 +---- +alive deleted + + +build +3: a---e +1: a---e +---- +3: a---e +1: a---e + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive deleted deleted deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive deleted + +get t=2 +a#1 a#0 +---- +alive deleted + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive deleted deleted deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive deleted + +get t=2 +d#1 d#0 +---- +alive deleted + + +build +1: a-c +3: c-e +---- +1: a-c +3: c-e + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive alive alive deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive deleted + +get t=2 +a#1 a#0 +---- +alive deleted + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive alive alive deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive deleted deleted deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive alive + +get t=2 +c#1 c#0 +---- +alive alive + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive deleted deleted deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive alive + +get t=2 +d#1 d#0 +---- +alive alive + + +build +3: a-c +1: c-e +---- +3: a-c +1: c-e + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive deleted deleted deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive alive + +get t=2 +a#1 a#0 +---- +alive alive + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive alive + +get t=2 +b#1 
b#0 +---- +alive alive + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive alive alive deleted + +get t=3 +c#2 c#1 c#0 +---- +alive alive deleted + +get t=2 +c#1 c#0 +---- +alive deleted + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive alive alive deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive deleted + +get t=2 +d#1 d#0 +---- +alive deleted + + +build +1: a-c +3: de +---- +1: a-c +3: de + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive alive alive deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive deleted + +get t=2 +a#1 a#0 +---- +alive deleted + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive alive alive deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive deleted + +get t=2 +b#1 b#0 +---- +alive deleted + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive alive alive alive + +get t=3 +c#2 c#1 c#0 +---- +alive alive alive + +get t=2 +c#1 c#0 +---- +alive alive + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive deleted deleted deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive alive + +get t=2 +d#1 d#0 +---- +alive alive + + +build +3: a-c +1: de +---- +3: a-c +1: de + +get t=4 +a#3 a#2 a#1 a#0 +---- +alive deleted deleted deleted + +get t=3 +a#2 a#1 a#0 +---- +alive alive alive + +get t=2 +a#1 a#0 +---- +alive alive + +get t=4 +b#3 b#2 b#1 b#0 +---- +alive deleted deleted deleted + +get t=3 +b#2 b#1 b#0 +---- +alive alive alive + +get t=2 +b#1 b#0 +---- +alive alive + +get t=4 +c#3 c#2 c#1 c#0 +---- +alive alive alive alive + +get t=3 +c#2 c#1 c#0 +---- +alive alive alive + +get t=2 +c#1 c#0 +---- +alive alive + +get t=4 +d#3 d#2 d#1 d#0 +---- +alive alive alive deleted + +get t=3 +d#2 d#1 d#0 +---- +alive alive deleted + +get t=2 +d#1 d#0 +---- +alive deleted diff --git a/pebble/internal/keyspan/testdata/fragmenter_covers b/pebble/internal/keyspan/testdata/fragmenter_covers new file mode 100644 index 0000000..abd505d --- /dev/null +++ b/pebble/internal/keyspan/testdata/fragmenter_covers @@ -0,0 +1,58 @@ +# This datadriven test uses a single command 'build' that illustrates a sequence +# of calls to a fragmenter. 
+# +# 'add' lines add a new span with the provided sequence number and the provided +# bounds. 'add' outputs nothing. +# +# 'deleted' lines test whether the provided key is deleted by a RANGEDEL in the +# fragmenter when read at the trailing snapshot sequence number. + +build +deleted a.SET.0 5 +add 3: a-----------m +deleted a.SET.0 5 +deleted a.SET.1 5 +deleted a.SET.1 2 +deleted a.SET.2 5 +deleted a.SET.3 5 +deleted l.SET.3 5 +add 2: f------------s +deleted e.SET.3 5 +deleted f.SET.2 5 +deleted l.SET.2 5 +deleted m.SET.2 5 +add 1: j---------------z +deleted j.SET.1 5 +deleted j.SET.1 1 +deleted j.SET.2 5 +deleted j.SET.3 5 +deleted l.SET.2 5 +deleted m.SET.2 5 +deleted r.SET.1 5 +deleted r.SET.1 1 +deleted s.SET.1 5 +deleted y.SET.0 5 +deleted z.SET.0 5 +---- +a#0,1: none +a#0,1: visibly +a#1,1: visibly +a#1,1: invisibly +a#2,1: visibly +a#3,1: none +l#3,1: none +e#3,1: pebble: keys must be in order: f > e#3,SET +f#2,1: visibly +l#2,1: visibly +m#2,1: none +j#1,1: visibly +j#1,1: invisibly +j#2,1: visibly +j#3,1: none +l#2,1: visibly +m#2,1: none +r#1,1: visibly +r#1,1: invisibly +s#1,1: none +y#0,1: visibly +z#0,1: none diff --git a/pebble/internal/keyspan/testdata/fragmenter_emit_order b/pebble/internal/keyspan/testdata/fragmenter_emit_order new file mode 100644 index 0000000..9af2c42 --- /dev/null +++ b/pebble/internal/keyspan/testdata/fragmenter_emit_order @@ -0,0 +1,21 @@ +build +a.RANGEKEYSET.5 b +a.RANGEKEYSET.4 b +a.RANGEKEYUNSET.6 b +---- +a b: #6,RANGEKEYUNSET, #5,RANGEKEYSET, #4,RANGEKEYSET +- + +# Test that keys emitted together that share the same sequence number are +# ordered by key kind, descending. 
+# NB: RANGEKEYSET > RANGEKEYUNSET > RANGEKEYDEL + +build +b.RANGEKEYSET.5 c +b.RANGEKEYUNSET.5 d +b.RANGEKEYDEL.5 c +---- +b c: #5,RANGEKEYSET, #5,RANGEKEYUNSET, #5,RANGEKEYDEL +- +c d: #5,RANGEKEYUNSET +- diff --git a/pebble/internal/keyspan/testdata/fragmenter_truncate_and_flush_to b/pebble/internal/keyspan/testdata/fragmenter_truncate_and_flush_to new file mode 100644 index 0000000..9b7ecce --- /dev/null +++ b/pebble/internal/keyspan/testdata/fragmenter_truncate_and_flush_to @@ -0,0 +1,113 @@ +build +2: a--c +1: b--d +truncate-and-flush-to c +---- +2: ab +2: bc +1: bc +1: cd + +build +truncate-and-flush-to c +1: b--d +---- +pebble: start key (b) < flushed key (c) + +build +truncate-and-flush-to c +truncate-and-flush-to b +---- +pebble: start key (b) < flushed key (c) + +# Call out of order + +build +3: a--d +2: d--g +truncate-and-flush-to c +---- +pebble: start key (c) < flushed key (d) + +build +3: a--d +truncate-and-flush-to a +---- +3: a--d + +build +3: a--d +2: d--g +truncate-and-flush-to d +---- +3: a--d +2: d--g + +build +2: a----f +truncate-and-flush-to c +---- +2: a-c +2: c--f + +build +2: a----f +truncate-and-flush-to f +---- +2: a----f + +build +2: a----f +truncate-and-flush-to g +---- +2: a----f + +build +3: a-c +1: a-----g +truncate-and-flush-to d +---- +3: a-c +1: a-c +1: cd +1: d--g + +build +2: a---e +1: a------h +truncate-and-flush-to c +---- +2: a-c +1: a-c +2: c-e +1: c-e +1: e--h + +build +3: a-c +2: a---e +1: a-----g +truncate-and-flush-to d +3: d----i +---- +3: a-c +2: a-c +1: a-c +2: cd +1: cd +3: de +2: de +1: de +3: e-g +1: e-g +3: g-i + +build +3: a-c +2: a-----g +truncate-and-flush-to e +---- +3: a-c +2: a-c +2: c-e +2: e-g diff --git a/pebble/internal/keyspan/testdata/fragmenter_values b/pebble/internal/keyspan/testdata/fragmenter_values new file mode 100644 index 0000000..7462ae8 --- /dev/null +++ b/pebble/internal/keyspan/testdata/fragmenter_values @@ -0,0 +1,65 @@ +build +3: a-----------m apples +2: f------------s bananas +1: 
j---------------z coconuts +---- +3: a----f apples +3: f---j apples +2: f---j bananas +3: j--m apples +2: j--m bananas +1: j--m coconuts +2: m-----s bananas +1: m-----s coconuts +1: s------z coconuts + +# Building is idempotent. +build +3: a----f a +3: f---j b +2: f---j c +3: j--m d +2: j--m e +1: j--m f +2: m-----s g +1: m-----s h +1: s------z i +---- +3: a----f a +3: f---j b +2: f---j c +3: j--m d +2: j--m e +1: j--m f +2: m-----s g +1: m-----s h +1: s------z i + +build +2: a--c apple +1: b--d banana +truncate-and-flush-to c +---- +2: ab apple +2: bc apple +1: bc banana +1: cd banana + +build +3: a-c apple +2: a---e banana +1: a-----g coconut +truncate-and-flush-to d +3: d----i orange +---- +3: a-c apple +2: a-c banana +1: a-c coconut +2: cd banana +1: cd coconut +3: de orange +2: de banana +1: de coconut +3: e-g orange +1: e-g coconut +3: g-i orange diff --git a/pebble/internal/keyspan/testdata/interleaving_iter b/pebble/internal/keyspan/testdata/interleaving_iter new file mode 100644 index 0000000..b49db93 --- /dev/null +++ b/pebble/internal/keyspan/testdata/interleaving_iter @@ -0,0 +1,998 @@ +define-rangekeys +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +c-d:{(#4,RANGEKEYSET,@3,coconut)} +e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +h-j:{(#22,RANGEKEYDEL) (#21,RANGEKEYSET,@5,peaches) (#21,RANGEKEYSET,@3,starfruit)} +l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)} +q-z:{(#14,RANGEKEYSET,@9,mangos)} +---- +OK + +define-pointkeys +artichoke.SET.10 +artichoke.SET.8 +carrot.SET.13 +cauliflower.DEL.9 +parsnip.SET.3 +tomato.SET.2 +zucchini.MERGE.12 +---- +OK + +iter +first +next +next +next +next +next +next +next +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)}) +PointKey: a#72057594037927935,21 +Span: 
a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +PointKey: artichoke#10,1 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +PointKey: artichoke#8,1 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +-- SpanChanged(c-d:{(#4,RANGEKEYSET,@3,coconut)}) +PointKey: c#72057594037927935,21 +Span: c-d:{(#4,RANGEKEYSET,@3,coconut)} +- +PointKey: carrot#13,1 +Span: c-d:{(#4,RANGEKEYSET,@3,coconut)} +- +PointKey: cauliflower#9,0 +Span: c-d:{(#4,RANGEKEYSET,@3,coconut)} +- +-- SpanChanged(e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)}) +PointKey: e#72057594037927935,21 +Span: e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +- +-- SpanChanged(h-j:{(#22,RANGEKEYDEL) (#21,RANGEKEYSET,@5,peaches) (#21,RANGEKEYSET,@3,starfruit)}) +PointKey: h#72057594037927935,19 +Span: h-j:{(#22,RANGEKEYDEL) (#21,RANGEKEYSET,@5,peaches) (#21,RANGEKEYSET,@3,starfruit)} +- +-- SpanChanged(l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)}) +PointKey: l#72057594037927935,20 +Span: l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)} +- +-- SpanChanged(nil) +PointKey: parsnip#3,1 +Span: +- +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: q#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +PointKey: tomato#2,1 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- + +# Test set-bounds passes through to the underlying point iterator and truncates +# a range key's end. 
+ +iter +set-bounds b carrot +seek-ge b +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(b-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)}) +PointKey: b#72057594037927935,21 +Span: b-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +-- SpanChanged(c-carrot:{(#4,RANGEKEYSET,@3,coconut)}) +PointKey: c#72057594037927935,21 +Span: c-carrot:{(#4,RANGEKEYSET,@3,coconut)} +- +-- SpanChanged(nil) +. + + +# Test set-bounds passes through to the underlying point iterator and truncates +# a range key's start. + +iter +set-bounds b carrot +seek-lt carrot +prev +prev +---- +-- SpanChanged(nil) +-- SpanChanged(c-carrot:{(#4,RANGEKEYSET,@3,coconut)}) +PointKey: c#72057594037927935,21 +Span: c-carrot:{(#4,RANGEKEYSET,@3,coconut)} +- +-- SpanChanged(b-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)}) +PointKey: b#72057594037927935,21 +Span: b-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +-- SpanChanged(nil) +. + +# Test seek-ge. +# NB: The `seek-ge yyy` case demonstrates truncation to the search key. 
+ +iter +first +seek-ge a +seek-ge p +seek-ge yyy +seek-ge z +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)}) +PointKey: a#72057594037927935,21 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)}) +PointKey: a#72057594037927935,21 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: parsnip#3,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: yyy#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: zucchini#12,2 +Span: +- + +iter +last +prev +prev +prev +prev +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: zucchini#12,2 +Span: +- +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: tomato#2,1 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +PointKey: q#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +-- SpanChanged(nil) +PointKey: parsnip#3,1 +Span: +- +-- SpanChanged(l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)}) +PointKey: l#72057594037927935,20 +Span: l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: parsnip#3,1 +Span: +- +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: q#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +PointKey: tomato#2,1 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +-- SpanChanged(nil) +PointKey: zucchini#12,2 +Span: +- + +iter +seek-ge tomato +next +seek-ge q +seek-ge parsnip +next 
+---- +-- SpanChanged(nil) +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: tomato#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +PointKey: tomato#2,1 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +-- SpanChanged(nil) +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: q#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: parsnip#3,1 +Span: +- +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: q#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- + +iter +seek-lt tomato +prev +seek-lt a +seek-lt tomato +seek-lt tomago +---- +-- SpanChanged(nil) +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: q#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +-- SpanChanged(nil) +PointKey: parsnip#3,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(nil) +. +-- SpanChanged(nil) +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: q#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- +-- SpanChanged(nil) +-- SpanChanged(q-z:{(#14,RANGEKEYSET,@9,mangos)}) +PointKey: q#72057594037927935,21 +Span: q-z:{(#14,RANGEKEYSET,@9,mangos)} +- + +define-rangekeys +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +c-d:{(#4,RANGEKEYSET,@3,coconut)} +e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +h-j:{(#22,RANGEKEYDEL) (#21,RANGEKEYSET,@5,peaches) (#21,RANGEKEYSET,@3,starfruit)} +l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)} +q-z:{(#14,RANGEKEYSET,@9,mangos)} +---- +OK + +define-pointkeys +a.SET.10 +a.SET.8 +b.SET.13 +c.DEL.9 +d.SET.3 +e.SET.2 +---- +OK + +iter +seek-ge a +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)}) +PointKey: a#72057594037927935,21 +Span: 
a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +PointKey: a#10,1 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +PointKey: a#8,1 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +PointKey: b#13,1 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- + +iter +seek-lt a +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +. + +iter +seek-ge ab +next +next +next +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)}) +PointKey: ab#72057594037927935,21 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +PointKey: b#13,1 +Span: a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas) (#4,RANGEKEYSET,@2,oranges)} +- +-- SpanChanged(c-d:{(#4,RANGEKEYSET,@3,coconut)}) +PointKey: c#72057594037927935,21 +Span: c-d:{(#4,RANGEKEYSET,@3,coconut)} +- +PointKey: c#9,0 +Span: c-d:{(#4,RANGEKEYSET,@3,coconut)} +- +-- SpanChanged(nil) +PointKey: d#3,1 +Span: +- +-- SpanChanged(e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)}) +PointKey: e#72057594037927935,21 +Span: e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +- +PointKey: e#2,1 +Span: e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +- +-- SpanChanged(h-j:{(#22,RANGEKEYDEL) (#21,RANGEKEYSET,@5,peaches) (#21,RANGEKEYSET,@3,starfruit)}) +PointKey: h#72057594037927935,19 +Span: h-j:{(#22,RANGEKEYDEL) (#21,RANGEKEYSET,@5,peaches) (#21,RANGEKEYSET,@3,starfruit)} +- + 
+define-rangekeys +a-z:{(#5,RANGEKEYSET,@5,apples)} +---- +OK + +define-pointkeys +a.SET.10 +a.SET.8 +b.SET.13 +c.DEL.9 +d.SET.3 +e.SET.2 +---- +OK + +iter +first +next +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: a#72057594037927935,21 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: a#10,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: a#8,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: b#13,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: c#9,0 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: d#3,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- + +# Switch to reverse within a range key. +# NB: The seek-ge b should truncate the range key a-z to b. + +iter +seek-ge b +prev +---- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: b#72057594037927935,21 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: a#8,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- + +# Switch to reverse after a seek-ge. Reverse iteration should not revisit the +# interleaved range-key start at the seek-ge bound: The range-key start should +# be interleaved at its true start key. + +iter +seek-ge b +next +prev +prev +prev +---- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: b#72057594037927935,21 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: b#13,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: a#8,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: a#10,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: a#72057594037927935,21 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- + +# Switch to forward iteration after a seek-lt. 
+ +iter +seek-lt c +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: b#13,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: c#9,0 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- + +iter +seek-lt c +prev +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: b#13,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +PointKey: a#8,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@5,apples)}) +PointKey: b#13,1 +Span: a-z:{(#5,RANGEKEYSET,@5,apples)} +- + +# Test sparse range keys. + +define-rangekeys +ace-bat:{(#5,RANGEKEYSET,@5,v5)} +x-z:{(#6,RANGEKEYSET,@6,v5)} +---- +OK + +define-pointkeys +a.SET.9 +b.SET.13 +c.DEL.9 +d.SET.18 +m.SET.4 +o.MERGE.3 +r.SET.22 +y.SET.3 +z.SET.3 +---- +OK + +iter +first +next +next +prev +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: a#9,1 +Span: +- +-- SpanChanged(ace-bat:{(#5,RANGEKEYSET,@5,v5)}) +PointKey: ace#72057594037927935,21 +Span: ace-bat:{(#5,RANGEKEYSET,@5,v5)} +- +PointKey: b#13,1 +Span: ace-bat:{(#5,RANGEKEYSET,@5,v5)} +- +-- SpanChanged(nil) +-- SpanChanged(ace-bat:{(#5,RANGEKEYSET,@5,v5)}) +PointKey: ace#72057594037927935,21 +Span: ace-bat:{(#5,RANGEKEYSET,@5,v5)} +- +-- SpanChanged(nil) +-- SpanChanged(ace-bat:{(#5,RANGEKEYSET,@5,v5)}) +PointKey: b#13,1 +Span: ace-bat:{(#5,RANGEKEYSET,@5,v5)} +- +-- SpanChanged(nil) +PointKey: c#9,0 +Span: +- + +iter +seek-lt ace +seek-lt zoo +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: a#9,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: z#3,1 +Span: +- + +iter +last +prev +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: z#3,1 +Span: +- +-- SpanChanged(x-z:{(#6,RANGEKEYSET,@6,v5)}) +PointKey: y#3,1 +Span: x-z:{(#6,RANGEKEYSET,@6,v5)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: z#3,1 +Span: +- +-- 
SpanChanged(nil) +. + +iter +seek-lt m +next +seek-ge m +prev +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: d#18,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: m#4,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: m#4,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: d#18,1 +Span: +- + +# First, Last, SeekLT and SeekGE elide spans without Sets. + +define-rangekeys +b-d:{(#5,RANGEKEYDEL)} +f-g:{(#6,RANGEKEYDEL)} +---- +OK + +define-pointkeys +c.SET.8 +---- +OK + +iter +first +last +seek-ge a +seek-lt d +---- +-- SpanChanged(nil) +-- SpanChanged(b-d:{(#5,RANGEKEYDEL)}) +PointKey: b#72057594037927935,19 +Span: b-d:{(#5,RANGEKEYDEL)} +- +-- SpanChanged(nil) +-- SpanChanged(f-g:{(#6,RANGEKEYDEL)}) +PointKey: f#72057594037927935,19 +Span: f-g:{(#6,RANGEKEYDEL)} +- +-- SpanChanged(nil) +-- SpanChanged(b-d:{(#5,RANGEKEYDEL)}) +PointKey: b#72057594037927935,19 +Span: b-d:{(#5,RANGEKEYDEL)} +- +-- SpanChanged(nil) +-- SpanChanged(b-d:{(#5,RANGEKEYDEL)}) +PointKey: c#8,1 +Span: b-d:{(#5,RANGEKEYDEL)} +- + +# Test a scenario where Next is out of point keys, the current range key has +# already been interleaved, and there are no more range keys. + +define-rangekeys +w-y:{(#5,RANGEKEYSET,@1,v1)} +y-z:{(#5,RANGEKEYDEL)} +---- +OK + +define-pointkeys +x.SET.8 +---- +OK + +iter +first +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(w-y:{(#5,RANGEKEYSET,@1,v1)}) +PointKey: w#72057594037927935,21 +Span: w-y:{(#5,RANGEKEYSET,@1,v1)} +- +PointKey: x#8,1 +Span: w-y:{(#5,RANGEKEYSET,@1,v1)} +- +-- SpanChanged(y-z:{(#5,RANGEKEYDEL)}) +PointKey: y#72057594037927935,19 +Span: y-z:{(#5,RANGEKEYDEL)} +- + +# Test a scenario where we change direction on a synthetic range key boundary +# key. +iter +first +prev +---- +-- SpanChanged(nil) +-- SpanChanged(w-y:{(#5,RANGEKEYSET,@1,v1)}) +PointKey: w#72057594037927935,21 +Span: w-y:{(#5,RANGEKEYSET,@1,v1)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +. 
+ +define-rangekeys +a-z:{(#5,RANGEKEYSET,@1,v1)} +---- +OK + +define-pointkeys +z.SET.8 +---- +OK + +iter +seek-ge c +prev +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@1,v1)}) +PointKey: c#72057594037927935,21 +Span: a-z:{(#5,RANGEKEYSET,@1,v1)} +- +-- SpanChanged(nil) +-- SpanChanged(a-z:{(#5,RANGEKEYSET,@1,v1)}) +PointKey: a#72057594037927935,21 +Span: a-z:{(#5,RANGEKEYSET,@1,v1)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: z#8,1 +Span: +- + +iter +set-bounds . c +first +set-bounds c . +last +prev +prev +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#5,RANGEKEYSET,@1,v1)}) +PointKey: a#72057594037927935,21 +Span: a-c:{(#5,RANGEKEYSET,@1,v1)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: z#8,1 +Span: +- +-- SpanChanged(c-z:{(#5,RANGEKEYSET,@1,v1)}) +PointKey: c#72057594037927935,21 +Span: c-z:{(#5,RANGEKEYSET,@1,v1)} +- +-- SpanChanged(nil) +. + +# Test switching directions after exhausting a range key iterator. +# Switching reverse to forward iteration. + +define-rangekeys +j-l:{(#3,RANGEKEYSET,@1,v0)} +---- +OK + +define-pointkeys +g.SET.1 +s.SET.1 +v.SET.2 +v.SET.1 +z.SET.1 +---- +OK + +iter +last +prev +prev +prev +prev +prev +next +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: z#1,1 +Span: +- +-- SpanChanged(nil) +PointKey: v#1,1 +Span: +- +-- SpanChanged(nil) +PointKey: v#2,1 +Span: +- +-- SpanChanged(nil) +PointKey: s#1,1 +Span: +- +-- SpanChanged(j-l:{(#3,RANGEKEYSET,@1,v0)}) +PointKey: j#72057594037927935,21 +Span: j-l:{(#3,RANGEKEYSET,@1,v0)} +- +-- SpanChanged(nil) +PointKey: g#1,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(j-l:{(#3,RANGEKEYSET,@1,v0)}) +PointKey: j#72057594037927935,21 +Span: j-l:{(#3,RANGEKEYSET,@1,v0)} +- + +# Test switching directions after exhausting a range key iterator. +# Switching forward to reverse iteration. 
+ +define-rangekeys +j-l:{(#3,RANGEKEYSET,@1,v0)} +---- +OK + +define-pointkeys +a.SET.1 +k.SET.1 +m.SET.1 +---- +OK + +iter +first +next +next +next +prev +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: a#1,1 +Span: +- +-- SpanChanged(j-l:{(#3,RANGEKEYSET,@1,v0)}) +PointKey: j#72057594037927935,21 +Span: j-l:{(#3,RANGEKEYSET,@1,v0)} +- +PointKey: k#1,1 +Span: j-l:{(#3,RANGEKEYSET,@1,v0)} +- +-- SpanChanged(nil) +PointKey: m#1,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(j-l:{(#3,RANGEKEYSET,@1,v0)}) +PointKey: k#1,1 +Span: j-l:{(#3,RANGEKEYSET,@1,v0)} +- + +# Test a seek that moves the lower bound beyond the upper bound. + +define-rangekeys +a-d:{(#10,RANGEKEYSET,@5,apples)} +---- +OK + +define-pointkeys +b.SET.8 +---- +OK + + +iter +set-bounds a c +seek-ge c +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +. + +iter +set-bounds a c +seek-lt a +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +. + +# Test a SeekLT that searches a keyspace exclusive with the iterator's bounds. +# Previously, there was a bug that would incorrectly surface the span with the +# iterator's bounds, despite the fact the SeekLT search key is exclusive. See +# the comment in keyspanSeekLT. + +define-rangekeys +b-f:{(#1,RANGEKEYSET,@1,foo)} +---- +OK + +define-pointkeys +f.SET.3 +---- +OK + +iter +set-bounds d e +seek-lt d +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +. + +# Test seek-prefix-ge and its truncation of bounds to the prefix's bounds. 
+ +define-rangekeys +b-d:{(#5,RANGEKEYSET,@1,foo)} +f-g:{(#6,RANGEKEYSET,@1,foo)} +---- +OK + +define-pointkeys +c.SET.8 +---- +OK + +iter +seek-prefix-ge b +next +seek-prefix-ge c +next +seek-ge c +---- +-- SpanChanged(nil) +-- SpanChanged(b-b\x00:{(#5,RANGEKEYSET,@1,foo)}) +PointKey: b#72057594037927935,21 +Span: b-b\x00:{(#5,RANGEKEYSET,@1,foo)} +- +PointKey: c#8,1 +Span: b-b\x00:{(#5,RANGEKEYSET,@1,foo)} +- +-- SpanChanged(nil) +-- SpanChanged(c-c\x00:{(#5,RANGEKEYSET,@1,foo)}) +PointKey: c#72057594037927935,21 +Span: c-c\x00:{(#5,RANGEKEYSET,@1,foo)} +- +PointKey: c#8,1 +Span: c-c\x00:{(#5,RANGEKEYSET,@1,foo)} +- +-- SpanChanged(nil) +-- SpanChanged(b-d:{(#5,RANGEKEYSET,@1,foo)}) +PointKey: c#72057594037927935,21 +Span: b-d:{(#5,RANGEKEYSET,@1,foo)} +- + +# Test NextPrefix + +define-rangekeys +b-e:{(#5,RANGEKEYSET,@9,foo)} +f-g:{(#6,RANGEKEYSET,@9,foo)} +---- +OK + +define-pointkeys +a@4.SET.8 +c@11.SET.8 +c@3.SET.8 +c@1.SET.4 +d@5.SET.3 +e@9.SET.2 +---- +OK + +iter +first +next-prefix +next-prefix +next-prefix +next-prefix +next-prefix +next-prefix +next-prefix +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: a@4#8,1 +Span: +- +-- SpanChanged(b-e:{(#5,RANGEKEYSET,@9,foo)}) +PointKey: b#72057594037927935,21 +Span: b-e:{(#5,RANGEKEYSET,@9,foo)} +- +PointKey: c@11#8,1 +Span: b-e:{(#5,RANGEKEYSET,@9,foo)} +- +PointKey: d@5#3,1 +Span: b-e:{(#5,RANGEKEYSET,@9,foo)} +- +-- SpanChanged(nil) +PointKey: e@9#2,1 +Span: +- +-- SpanChanged(f-g:{(#6,RANGEKEYSET,@9,foo)}) +PointKey: f#72057594037927935,21 +Span: f-g:{(#6,RANGEKEYSET,@9,foo)} +- +-- SpanChanged(nil) +. +. diff --git a/pebble/internal/keyspan/testdata/interleaving_iter_masking b/pebble/internal/keyspan/testdata/interleaving_iter_masking new file mode 100644 index 0000000..8ad8fb3 --- /dev/null +++ b/pebble/internal/keyspan/testdata/interleaving_iter_masking @@ -0,0 +1,501 @@ +# Test the scenario illustrated in the below visualization. 
+# +# ^ +# @9 | •―――――――――――――――○ [e,m)@9 +# s 8 | • l@8 +# u 7 |------------------------------------ @7 masking +# f 6 | [h,q)@6 •―――――――――――――――――○ threshold +# f 5 | • h@5 +# f 4 | • n@4 +# i 3 | •―――――――――――○ [f,l)@3 +# x 2 | • b@2 +# 1 | +# 0 |___________________________________ +# a b c d e f g h i j k l m n o p q +# + +define-rangekeys +e-f:{(#1,RANGEKEYSET,@9,foo)} +f-h:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@3,bar)} +h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)} +l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +m-q:{(#1,RANGEKEYSET,@6,bax)} +---- +OK + +define-pointkeys +b@2.SET.1 +h@5.SET.1 +l@8.SET.1 +n@4.SET.1 +---- +OK + +set-masking-threshold +@7 +---- +OK + +iter +first +next +next +next +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: b@2#1,1 +Span: +- +-- SpanChanged(e-f:{(#1,RANGEKEYSET,@9,foo)}) +PointKey: e#72057594037927935,21 +Span: e-f:{(#1,RANGEKEYSET,@9,foo)} +- +-- SpanChanged(f-h:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@3,bar)}) +PointKey: f#72057594037927935,21 +Span: f-h:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@3,bar)} +- +-- SpanChanged(h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)}) +PointKey: h#72057594037927935,21 +Span: h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)} +- +-- SpanChanged(l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)}) +PointKey: l#72057594037927935,21 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +PointKey: l@8#1,1 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(m-q:{(#1,RANGEKEYSET,@6,bax)}) +PointKey: m#72057594037927935,21 +Span: m-q:{(#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(nil) +. 
+ +iter +last +prev +prev +prev +prev +prev +prev +prev +---- +-- SpanChanged(nil) +-- SpanChanged(m-q:{(#1,RANGEKEYSET,@6,bax)}) +PointKey: m#72057594037927935,21 +Span: m-q:{(#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)}) +PointKey: l@8#1,1 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +PointKey: l#72057594037927935,21 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)}) +PointKey: h#72057594037927935,21 +Span: h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)} +- +-- SpanChanged(f-h:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@3,bar)}) +PointKey: f#72057594037927935,21 +Span: f-h:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@3,bar)} +- +-- SpanChanged(e-f:{(#1,RANGEKEYSET,@9,foo)}) +PointKey: e#72057594037927935,21 +Span: e-f:{(#1,RANGEKEYSET,@9,foo)} +- +-- SpanChanged(nil) +PointKey: b@2#1,1 +Span: +- +-- SpanChanged(nil) +. 
+ +iter +seek-ge a +seek-ge c +seek-ge h +seek-ge i +seek-ge l +next +seek-ge m +seek-ge r +---- +-- SpanChanged(nil) +-- SpanChanged(nil) +PointKey: b@2#1,1 +Span: +- +-- SpanChanged(nil) +-- SpanChanged(e-f:{(#1,RANGEKEYSET,@9,foo)}) +PointKey: e#72057594037927935,21 +Span: e-f:{(#1,RANGEKEYSET,@9,foo)} +- +-- SpanChanged(nil) +-- SpanChanged(h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)}) +PointKey: h#72057594037927935,21 +Span: h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)} +- +-- SpanChanged(nil) +-- SpanChanged(h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)}) +PointKey: i#72057594037927935,21 +Span: h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)} +- +-- SpanChanged(nil) +-- SpanChanged(l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)}) +PointKey: l#72057594037927935,21 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +PointKey: l@8#1,1 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(nil) +-- SpanChanged(m-q:{(#1,RANGEKEYSET,@6,bax)}) +PointKey: m#72057594037927935,21 +Span: m-q:{(#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(nil) +-- SpanChanged(nil) +. + +# Setting the masking threshold to @9 should result in l@8 being masked by +# [e,m)@9. 
+ +set-masking-threshold +@9 +---- +OK + +iter +seek-ge l +next +seek-lt l +seek-lt ll +prev +---- +-- SpanChanged(nil) +-- SpanChanged(l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)}) +PointKey: l#72057594037927935,21 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(m-q:{(#1,RANGEKEYSET,@6,bax)}) +PointKey: m#72057594037927935,21 +Span: m-q:{(#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(nil) +-- SpanChanged(h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)}) +PointKey: h#72057594037927935,21 +Span: h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)} +- +-- SpanChanged(nil) +-- SpanChanged(l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)}) +PointKey: l#72057594037927935,21 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)}) +PointKey: h#72057594037927935,21 +Span: h-l:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax) (#1,RANGEKEYSET,@3,bar)} +- + +iter +seek-ge l +next +---- +-- SpanChanged(nil) +-- SpanChanged(l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)}) +PointKey: l#72057594037927935,21 +Span: l-m:{(#1,RANGEKEYSET,@9,foo) (#1,RANGEKEYSET,@6,bax)} +- +-- SpanChanged(m-q:{(#1,RANGEKEYSET,@6,bax)}) +PointKey: m#72057594037927935,21 +Span: m-q:{(#1,RANGEKEYSET,@6,bax)} +- + +define-rangekeys +a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)} +---- +OK + +define-pointkeys +a.SET.1 +a@3.SET.1 +a@12.SET.1 +b@2.SET.1 +---- +OK + +set-masking-threshold +@10 +---- +OK + +# Test that both a@3 and b@2 are masked by the rangekey. +# The unsuffixed point key 'a' and the point key at a higher timestamp 'a@12' +# are not masked. 
+ +iter +first +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)}) +PointKey: a#72057594037927935,21 +Span: a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)} +- +PointKey: a#1,1 +Span: a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)} +- +PointKey: a@12#1,1 +Span: a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)} +- +-- SpanChanged(nil) +. + +iter +last +prev +prev +prev +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)}) +PointKey: a@12#1,1 +Span: a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)} +- +PointKey: a#1,1 +Span: a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)} +- +PointKey: a#72057594037927935,21 +Span: a-c:{(#1,RANGEKEYSET,@5,apples) (#1,RANGEKEYSET,@2,bananas)} +- +-- SpanChanged(nil) +. + +# Try the same test, but with a range key that sorts before the masking +# threshold (eg, higher MVCC timestamp). Nothing should be masked. + +define-rangekeys +a-c:{(#2,RANGEKEYSET,@20,apples)} +---- +OK + +iter +first +next +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#2,RANGEKEYSET,@20,apples)}) +PointKey: a#72057594037927935,21 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +PointKey: a#1,1 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +PointKey: a@3#1,1 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +PointKey: a@12#1,1 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +PointKey: b@2#1,1 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +-- SpanChanged(nil) +. 
+ +iter +last +prev +prev +prev +prev +prev +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#2,RANGEKEYSET,@20,apples)}) +PointKey: b@2#1,1 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +PointKey: a@12#1,1 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +PointKey: a@3#1,1 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +PointKey: a#1,1 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +PointKey: a#72057594037927935,21 +Span: a-c:{(#2,RANGEKEYSET,@20,apples)} +- +-- SpanChanged(nil) +. + +# Try the original test, but with an internal range key containing just an +# Unset, and no Set. Nothing should be masked. No range keys should be surfaced, +# because there are none. + +define-rangekeys +a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)} +---- +OK + +iter +first +next +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)}) +PointKey: a#72057594037927935,20 +Span: a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)} +- +PointKey: a#1,1 +Span: a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)} +- +PointKey: a@12#1,1 +Span: a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)} +- +-- SpanChanged(nil) +. +-- SpanChanged(nil) +. + +iter +last +prev +prev +prev +prev +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)}) +PointKey: a@12#1,1 +Span: a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)} +- +PointKey: a#1,1 +Span: a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)} +- +PointKey: a#72057594037927935,20 +Span: a-c:{(#1,RANGEKEYUNSET,@5) (#1,RANGEKEYUNSET,@2)} +- +-- SpanChanged(nil) +. +-- SpanChanged(nil) +. + +# Test a scenario where a point key is masked in the forward direction, which in +# turn requires nexting to the next range key as well. 
+ +define-rangekeys +a-c:{(#1,RANGEKEYSET,@5,apples)} +c-z:{(#1,RANGEKEYSET,@10,bananas)} +---- +OK + +define-pointkeys +b@3.SET.2 +d@9.SET.4 +j@11.SET.3 +---- +OK + +set-masking-threshold +@20 +---- +OK + +iter +first +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#1,RANGEKEYSET,@5,apples)}) +PointKey: a#72057594037927935,21 +Span: a-c:{(#1,RANGEKEYSET,@5,apples)} +- +-- SpanChanged(c-z:{(#1,RANGEKEYSET,@10,bananas)}) +PointKey: c#72057594037927935,21 +Span: c-z:{(#1,RANGEKEYSET,@10,bananas)} +- +PointKey: j@11#3,1 +Span: c-z:{(#1,RANGEKEYSET,@10,bananas)} +- + +iter +last +prev +prev +---- +-- SpanChanged(nil) +-- SpanChanged(c-z:{(#1,RANGEKEYSET,@10,bananas)}) +PointKey: j@11#3,1 +Span: c-z:{(#1,RANGEKEYSET,@10,bananas)} +- +PointKey: c#72057594037927935,21 +Span: c-z:{(#1,RANGEKEYSET,@10,bananas)} +- +-- SpanChanged(a-c:{(#1,RANGEKEYSET,@5,apples)}) +PointKey: a#72057594037927935,21 +Span: a-c:{(#1,RANGEKEYSET,@5,apples)} +- + +# Test a scenario where there's an empty range key, requiring the interleaving +# iter to call SpanChanged(nil) which should clear the previous mask. + +define-rangekeys +a-c:{(#1,RANGEKEYSET,@10,apples)} +c-e:{} +e-f:{(#1,RANGEKEYSET,@5,bananas)} +---- +OK + +define-pointkeys +a@2.SET.4 +b@9.SET.2 +d@9.SET.3 +---- +OK + +set-masking-threshold +@20 +---- +OK + +iter +seek-ge a +next +next +next +---- +-- SpanChanged(nil) +-- SpanChanged(a-c:{(#1,RANGEKEYSET,@10,apples)}) +PointKey: a#72057594037927935,21 +Span: a-c:{(#1,RANGEKEYSET,@10,apples)} +- +-- SpanChanged(nil) +PointKey: d@9#3,1 +Span: +- +-- SpanChanged(e-f:{(#1,RANGEKEYSET,@5,bananas)}) +PointKey: e#72057594037927935,21 +Span: e-f:{(#1,RANGEKEYSET,@5,bananas)} +- +-- SpanChanged(nil) +.
diff --git a/pebble/internal/keyspan/testdata/iter b/pebble/internal/keyspan/testdata/iter new file mode 100644 index 0000000..5a1c451 --- /dev/null +++ b/pebble/internal/keyspan/testdata/iter @@ -0,0 +1,55 @@ +define +a-b:{(#2,SET) (#1,SET)} +b-c:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} +---- + +iter +seek-ge a +seek-ge b +seek-ge c +seek-ge cat +seek-ge d +seek-lt a +seek-lt b +seek-lt c +seek-lt cat +seek-lt d +seek-lt e +---- +a-b:{(#2,SET) (#1,SET)} +b-c:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} +. +. +a-b:{(#2,SET) (#1,SET)} +b-c:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} + +iter +first +next +prev +prev +next +next +next +prev +next +next +prev +---- +a-b:{(#2,SET) (#1,SET)} +b-c:{(#2,SET) (#1,SET)} +a-b:{(#2,SET) (#1,SET)} +. +a-b:{(#2,SET) (#1,SET)} +b-c:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} +b-c:{(#2,SET) (#1,SET)} +c-d:{(#2,SET) (#1,SET)} +. +c-d:{(#2,SET) (#1,SET)} diff --git a/pebble/internal/keyspan/testdata/level_iter b/pebble/internal/keyspan/testdata/level_iter new file mode 100644 index 0000000..3919819 --- /dev/null +++ b/pebble/internal/keyspan/testdata/level_iter @@ -0,0 +1,475 @@ + +# Simple case. 
+ +define +file + a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} + c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +---- + +iter +seek-ge a +seek-ge apple +seek-ge b +seek-ge banana +seek-ge c +seek-ge cantalope +seek-ge d +seek-ge dragonfruit +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +. +. + +iter +seek-lt a +seek-lt apple +seek-lt b +seek-lt banana +seek-lt c +seek-lt cantalope +seek-lt d +seek-lt dragonfruit +prev +---- +. +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) + +iter +seek-ge a +prev +seek-lt d +next +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +. +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +. + +iter +first +next +next +next +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +. 
+ +iter +last +prev +prev +prev +---- +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +. + +# Set some bounds + +iter +seek-ge a +seek-ge b +seek-ge c +seek-ge d +seek-lt a +seek-lt b +seek-lt c +seek-lt d +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +. +. +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) + + +iter +seek-lt cc +prev +prev +prev +---- +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +. + +# Test skipping over empty/point-key-only files in both directions. 
+ +define +file + a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + point:b.SET.1:foo +file + c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} + d-e:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +---- + +num-files +---- +3 + +iter +first +next +next +next +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{} (file = 000001.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +d-e:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) + +iter +last +prev +prev +prev +---- +d-e:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +b-c:{} (file = 000003.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) + +# Test straddle keys between files. + +define +file + a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +---- + +iter +first +next +next +next +next +next +next +next +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{} (file = 000001.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +d-e:{} (file = 000002.sst) +e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +f-g:{} (file = 000003.sst) +g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000004.sst) +. 
+ +iter +last +prev +prev +prev +prev +prev +prev +prev +---- +g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000004.sst) +f-g:{} (file = 000004.sst) +e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +d-e:{} (file = 000003.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{} (file = 000002.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +. + +# The below case seeks into a file straddle, then iterates forward and back to +# it, and confirms that changing iterator directions on a straddle does the +# right thing. + +iter +seek-ge bb +next +prev +next +prev +prev +---- +b-c:{} (file = 000001.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{} (file = 000002.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) + +# The same case as above, but with inverted directions. + +iter +seek-lt dd +prev +next +prev +next +next +---- +d-e:{} (file = 000001.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +d-e:{} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +d-e:{} (file = 000002.sst) +e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) + +iter +seek-lt dd +prev +next +prev +next +next +---- +d-e:{} (file = 000003.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +d-e:{} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +d-e:{} (file = 000002.sst) +e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) + +# Seeks right at the bound should return nothing. 
+ +iter +seek-lt bb +---- +b-c:{} (file = 000003.sst) + +iter +seek-ge dd +---- +d-e:{} (file = 000003.sst) + +iter +seek-lt d +prev +next +prev +prev +prev +next +next +---- +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{} (file = 000002.sst) +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +b-c:{} (file = 000002.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +. +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{} (file = 000001.sst) + +# A bunch of files with point keys only should not fragment straddles. + +define +file + a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + point:c.SET.1:foo +file + point:d.SET.1:foo +file + e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + point:g.SET.1:foo +file + h-i:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +---- + +iter +first +next +next +next +next +next +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-e:{} (file = 000001.sst) +e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000004.sst) +f-h:{} (file = 000004.sst) +h-i:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000006.sst) +. + +iter +last +prev +prev +prev +prev +prev +---- +h-i:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000006.sst) +f-h:{} (file = 000006.sst) +e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000004.sst) +b-e:{} (file = 000004.sst) +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +. 
+ +# Test files with range keys and rangedels + +define +file + a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} + point:a.SET.1:foo + point:b.SET.1:foo +file + c-e:{(#3,RANGEKEYSET,@3,baz) (#3,RANGEKEYSET,@1,bar)} + point:c.RANGEDEL.2:f + point:d.SET.1:foo +file + g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} + i-j:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} + point:f.RANGEDEL.2:g +---- + +iter rangedel +first +next +next +next +---- +c-f:{(#2,RANGEDEL)} (file = 000002.sst) +f-g:{(#2,RANGEDEL)} (file = 000003.sst) +. +. + +iter rangedel +last +prev +prev +prev +---- +f-g:{(#2,RANGEDEL)} (file = 000003.sst) +c-f:{(#2,RANGEDEL)} (file = 000002.sst) +. +. + +iter rangedel +seek-ge c +next +next +---- +c-f:{(#2,RANGEDEL)} (file = 000002.sst) +f-g:{(#2,RANGEDEL)} (file = 000003.sst) +. + +iter rangedel +seek-lt ff +prev +next +prev +prev +---- +f-g:{(#2,RANGEDEL)} (file = 000003.sst) +c-f:{(#2,RANGEDEL)} (file = 000002.sst) +f-g:{(#2,RANGEDEL)} (file = 000003.sst) +c-f:{(#2,RANGEDEL)} (file = 000002.sst) +. + +close-iter +---- +ok + +# Test that a regular LevelIter ignores rangedels and emits straddle spans. + +iter +first +next +next +next +next +next +---- +a-b:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +b-c:{} (file = 000001.sst) +c-e:{(#3,RANGEKEYSET,@3,baz) (#3,RANGEKEYSET,@1,bar)} (file = 000002.sst) +e-g:{} (file = 000002.sst) +g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +i-j:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) + +iter +seek-ge c +next +next +next +next +---- +c-e:{(#3,RANGEKEYSET,@3,baz) (#3,RANGEKEYSET,@1,bar)} (file = 000002.sst) +e-g:{} (file = 000002.sst) +g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +i-j:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +. + +# Test seeking outside of bounds with straddles. 
+ +define +file + c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +file + g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} +---- + +iter +seek-lt j +next +prev +prev +---- +g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +. +g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +f-g:{} (file = 000003.sst) + +iter +seek-lt j +prev +prev +next +next +---- +g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) +f-g:{} (file = 000003.sst) +e-f:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000002.sst) +f-g:{} (file = 000002.sst) +g-h:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000003.sst) + +iter +seek-ge a +prev +next +next +---- +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +. +c-d:{(#2,RANGEKEYSET,@3,foo) (#1,RANGEKEYSET,@1,bar)} (file = 000001.sst) +d-e:{} (file = 000001.sst) diff --git a/pebble/internal/keyspan/testdata/merging_iter b/pebble/internal/keyspan/testdata/merging_iter new file mode 100644 index 0000000..aa309e2 --- /dev/null +++ b/pebble/internal/keyspan/testdata/merging_iter @@ -0,0 +1,758 @@ +# Test a single level. 
+ +define +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,coconut)} +e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +h-j:{(#22,RANGEKEYDEL) (#21,RANGEKEYSET,@5,peaches) (#21,RANGEKEYSET,@3,starfruit)} +l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)} +q-z:{(#14,RANGEKEYSET,@9,mangos)} +---- +1 levels + +iter +first +next +next +next +next +next +next +---- +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,coconut)} +e-f:{(#20,RANGEKEYSET,@5,pineapple) (#20,RANGEKEYSET,@3,guava)} +h-j:{(#22,RANGEKEYDEL) (#21,RANGEKEYSET,@5,peaches) (#21,RANGEKEYSET,@3,starfruit)} +l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)} +q-z:{(#14,RANGEKEYSET,@9,mangos)} + + +# Test snapshot filtering. + +iter snapshot=12 +first +next +next +next +next +next +next +---- +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas)} +c-d:{(#4,RANGEKEYSET,@3,coconut)} +e-f:{} +h-j:{} +l-m:{(#2,RANGEKEYUNSET,@9) (#2,RANGEKEYUNSET,@5)} +q-z:{} + + +# Test error handling on seeks. + +iter probes=(0,ErrInjected,(Log "# inner.")) +first +last +seek-ge boo +seek-lt lemon +---- +# inner.First() = nil + err= +# inner.Last() = nil + err= +# inner.SeekLT("boo") = nil + err= +# inner.SeekGE("lemon") = nil + err= + +# Test error handling on steps. 
+ +iter probes=(0,(If (Or OpNext OpPrev) ErrInjected noop),(Log "# inner.")) +first +next +last +prev +---- +# inner.First() = a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas)} +a-c:{(#10,RANGEKEYSET,@5,apples) (#10,RANGEKEYDEL) (#8,RANGEKEYUNSET,@1) (#4,RANGEKEYSET,@3,bananas)} +# inner.Next() = nil + err= +# inner.Last() = q-z:{(#14,RANGEKEYSET,@9,mangos)} +q-z:{(#14,RANGEKEYSET,@9,mangos)} +# inner.Prev() = nil + err= + +define +b-d:{#10,RANGEKEYSET,@1,apples} +e-h:{#8,RANGEKEYDEL} +-- +a-c:{#3,RANGEKEYUNSET,@1} +h-k:{#5,RANGEKEYDEL} +---- +2 levels + +iter +first +next +next +next +next +next +---- +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +e-h:{(#8,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} + + +iter +last +prev +prev +prev +prev +prev +---- +h-k:{(#5,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} + + +# Test changing directions at each iterator position, reverse to forward. 
+iter +last +next +last +prev +next +---- +h-k:{(#5,RANGEKEYDEL)} + +h-k:{(#5,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} + +iter +last +prev +prev +next +---- +h-k:{(#5,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +e-h:{(#8,RANGEKEYDEL)} + +iter +last +prev +prev +prev +next +---- +h-k:{(#5,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +c-d:{(#10,RANGEKEYSET,@1,apples)} + +iter +last +prev +prev +prev +prev +next +---- +h-k:{(#5,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} + +iter +last +prev +prev +prev +prev +prev +next +---- +h-k:{(#5,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} + +a-b:{(#3,RANGEKEYUNSET,@1)} + +# Test changing directions at each iterator position, forward to reverse. 
+ +iter +first +prev +first +next +prev +---- +a-b:{(#3,RANGEKEYUNSET,@1)} + +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} + +iter +first +next +next +prev +---- +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} + +iter +first +next +next +next +prev +---- +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +e-h:{(#8,RANGEKEYDEL)} +c-d:{(#10,RANGEKEYSET,@1,apples)} + +iter +first +next +next +next +next +next +prev +---- +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +e-h:{(#8,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} + +h-k:{(#5,RANGEKEYDEL)} + +iter +first +next +next +next +next +prev +---- +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +e-h:{(#8,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} + +# Test SeekGE. Note that MergingIter's SeekGE implements the FragmentIterator's +# SeekGE semantics. It returns the first fragment that covers a key ≥ the search +# key. + +iter +seek-ge cc +---- +c-d:{(#10,RANGEKEYSET,@1,apples)} + +iter +seek-ge 1 +seek-ge a +seek-ge b +seek-ge bb +---- +a-b:{(#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} + +iter +seek-ge c +seek-ge cc +seek-ge e +seek-ge f +---- +c-d:{(#10,RANGEKEYSET,@1,apples)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +e-h:{(#8,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} + +iter +seek-ge h +seek-ge i +seek-ge k +seek-ge l +---- +h-k:{(#5,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} + + + +# Test SeekLT. Note that MergingIter's SeekLT implements the FragmentIterator's +# SeekLT semantics. 
It returns the first fragment with a Start key < the search +# key, NOT the first fragment that covers a key < the search key. +# +# NB: seek-lt bb finds b-c#3.RANGEKEYUNSET (the last fragment with the bounds +# [b,c), unlike the above seek-ge b which finds the first). + +iter +seek-lt b +---- +a-b:{(#3,RANGEKEYUNSET,@1)} + +iter +seek-lt 1 +seek-lt a +seek-lt aa +seek-lt b +seek-lt bb +seek-lt c +---- + + +a-b:{(#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} + +iter +seek-lt cc +seek-lt d +seek-lt dd +seek-lt e +seek-lt ee +seek-lt h +seek-lt hh +seek-lt k +seek-lt z +---- +c-d:{(#10,RANGEKEYSET,@1,apples)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +e-h:{(#8,RANGEKEYDEL)} +e-h:{(#8,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} + +# Test error handling with multiple levels. Inject errors in all operations on +# the first iterator, and none of the second iterator. 
+ +iter probes=(0,ErrInjected,(Log "# a.")) probes=(1,(Log "# b.")) +seek-ge a +seek-ge b +seek-ge c +seek-ge d +seek-ge e +seek-ge f +seek-ge g +seek-ge h +seek-ge i +seek-ge j +seek-ge k +seek-ge z +---- +# a.SeekLT("a") = nil +# b.SeekLT("a") = nil + err= +# a.SeekLT("b") = nil +# b.SeekLT("b") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekLT("c") = nil +# b.SeekLT("c") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekLT("d") = nil +# b.SeekLT("d") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekLT("e") = nil +# b.SeekLT("e") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekLT("f") = nil +# b.SeekLT("f") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekLT("g") = nil +# b.SeekLT("g") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekLT("h") = nil +# b.SeekLT("h") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekLT("i") = nil +# b.SeekLT("i") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekLT("j") = nil +# b.SeekLT("j") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekLT("k") = nil +# b.SeekLT("k") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekLT("z") = nil +# b.SeekLT("z") = h-k:{(#5,RANGEKEYDEL)} + err= + +# Test the same as above, but with errors injected on the second iterator. 
+ +iter probes=(0,(Log "# a.")) probes=(1,ErrInjected,(Log "# b.")) +seek-ge a +seek-ge b +seek-ge c +seek-ge d +seek-ge e +seek-ge f +seek-ge g +seek-ge h +seek-ge i +seek-ge j +seek-ge k +seek-ge z +---- +# a.SeekLT("a") = nil +# b.SeekLT("a") = nil + err= +# a.SeekLT("b") = nil +# b.SeekLT("b") = nil + err= +# a.SeekLT("c") = b-d:{(#10,RANGEKEYSET,@1,apples)} +# b.SeekLT("c") = nil + err= +# a.SeekLT("d") = b-d:{(#10,RANGEKEYSET,@1,apples)} +# b.SeekLT("d") = nil + err= +# a.SeekLT("e") = b-d:{(#10,RANGEKEYSET,@1,apples)} +# b.SeekLT("e") = nil + err= +# a.SeekLT("f") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekLT("f") = nil + err= +# a.SeekLT("g") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekLT("g") = nil + err= +# a.SeekLT("h") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekLT("h") = nil + err= +# a.SeekLT("i") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekLT("i") = nil + err= +# a.SeekLT("j") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekLT("j") = nil + err= +# a.SeekLT("k") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekLT("k") = nil + err= +# a.SeekLT("z") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekLT("z") = nil + err= + +# Test SeekLTs with errors injected on the first iterator. 
+ +iter probes=(0,ErrInjected,(Log "# a.")) probes=(1,(Log "# b.")) +seek-lt a +seek-lt b +seek-lt c +seek-lt d +seek-lt e +seek-lt f +seek-lt g +seek-lt h +seek-lt i +seek-lt j +seek-lt k +seek-lt z +---- +# a.SeekGE("a") = nil +# b.SeekGE("a") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekGE("b") = nil +# b.SeekGE("b") = a-c:{(#3,RANGEKEYUNSET,@1)} + err= +# a.SeekGE("c") = nil +# b.SeekGE("c") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekGE("d") = nil +# b.SeekGE("d") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekGE("e") = nil +# b.SeekGE("e") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekGE("f") = nil +# b.SeekGE("f") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekGE("g") = nil +# b.SeekGE("g") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekGE("h") = nil +# b.SeekGE("h") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekGE("i") = nil +# b.SeekGE("i") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekGE("j") = nil +# b.SeekGE("j") = h-k:{(#5,RANGEKEYDEL)} + err= +# a.SeekGE("k") = nil +# b.SeekGE("k") = nil + err= +# a.SeekGE("z") = nil +# b.SeekGE("z") = nil + err= + +# Test SeekLTs with errors injected on the second iterator. 
+ +iter probes=(0,(Log "# a.")) probes=(1,ErrInjected,(Log "# b.")) +seek-lt a +seek-lt b +seek-lt c +seek-lt d +seek-lt e +seek-lt f +seek-lt g +seek-lt h +seek-lt i +seek-lt j +seek-lt k +seek-lt z +---- +# a.SeekGE("a") = b-d:{(#10,RANGEKEYSET,@1,apples)} +# b.SeekGE("a") = nil + err= +# a.SeekGE("b") = b-d:{(#10,RANGEKEYSET,@1,apples)} +# b.SeekGE("b") = nil + err= +# a.SeekGE("c") = b-d:{(#10,RANGEKEYSET,@1,apples)} +# b.SeekGE("c") = nil + err= +# a.SeekGE("d") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekGE("d") = nil + err= +# a.SeekGE("e") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekGE("e") = nil + err= +# a.SeekGE("f") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekGE("f") = nil + err= +# a.SeekGE("g") = e-h:{(#8,RANGEKEYDEL)} +# b.SeekGE("g") = nil + err= +# a.SeekGE("h") = nil +# b.SeekGE("h") = nil + err= +# a.SeekGE("i") = nil +# b.SeekGE("i") = nil + err= +# a.SeekGE("j") = nil +# b.SeekGE("j") = nil + err= +# a.SeekGE("k") = nil +# b.SeekGE("k") = nil + err= +# a.SeekGE("z") = nil +# b.SeekGE("z") = nil + err= + +# Test error handling during Next. + +iter probes=(0,(If OpNext ErrInjected noop),(Log "# a.")) probes=(1,(Log "# b.")) +first +next +next +next +---- +# a.First() = b-d:{(#10,RANGEKEYSET,@1,apples)} +# b.First() = a-c:{(#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +# b.Next() = h-k:{(#5,RANGEKEYDEL)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +# a.Next() = nil + err= + +iter probes=(0,(Log "# a.")) probes=(1,(If OpNext ErrInjected noop),(Log "# b.")) +first +next +next +---- +# a.First() = b-d:{(#10,RANGEKEYSET,@1,apples)} +# b.First() = a-c:{(#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +# b.Next() = nil + err= + +# Test error handling during Prev. 
+ +iter probes=(0,(If OpPrev ErrInjected noop),(Log "# a.")) probes=(1,(Log "# b.")) +last +prev +prev +---- +# a.Last() = e-h:{(#8,RANGEKEYDEL)} +# b.Last() = h-k:{(#5,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} +# b.Prev() = a-c:{(#3,RANGEKEYUNSET,@1)} +e-h:{(#8,RANGEKEYDEL)} +# a.Prev() = nil + err= + +iter probes=(0,(Log "# a.")) probes=(1,(If OpPrev ErrInjected noop),(Log "# b.")) +last +prev +---- +# a.Last() = e-h:{(#8,RANGEKEYDEL)} +# b.Last() = h-k:{(#5,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} +# b.Prev() = nil + err= + +define +a-f:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL)} +k-s:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL)} +---- +1 levels + +iter +first +prev +next +---- +a-f:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL)} + +a-f:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL)} + +iter +last +next +prev +---- +k-s:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL)} + +k-s:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL)} + +define +w-x:{(#5,RANGEKEYDEL) (#3,RANGEKEYDEL)} +x-z:{(#5,RANGEKEYDEL)} +-- +w-y:{(#4,RANGEKEYDEL) (#1,RANGEKEYDEL)} +---- +2 levels + +iter +last +next +prev +first +prev +next +---- +y-z:{(#5,RANGEKEYDEL)} + +y-z:{(#5,RANGEKEYDEL)} +w-x:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#3,RANGEKEYDEL) (#1,RANGEKEYDEL)} + +w-x:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#3,RANGEKEYDEL) (#1,RANGEKEYDEL)} + +iter +seek-ge x +prev +seek-ge xray +prev +---- +x-y:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#1,RANGEKEYDEL)} +w-x:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#3,RANGEKEYDEL) (#1,RANGEKEYDEL)} +x-y:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#1,RANGEKEYDEL)} +w-x:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL) (#3,RANGEKEYDEL) (#1,RANGEKEYDEL)} + +define +il-qb:{(#10,RANGEKEYDEL)} +sn-wn:{(#10,RANGEKEYDEL)} +-- +qt-kh:{(#9,RANGEKEYDEL) (#8,RANGEKEYDEL) (#7,RANGEKEYDEL)} +ky-sv:{(#8,RANGEKEYDEL) (#7,RANGEKEYDEL)} +-- +as-fz:{(#5,RANGEKEYDEL) (#4,RANGEKEYDEL)} +hh-ir:{(#4,RANGEKEYDEL)} +rf-yx:{(#4,RANGEKEYDEL)} +---- +3 levels + +iter +seek-ge qp +next +next +next +next +next +seek-ge yz +prev +---- +qb-rf:{(#8,RANGEKEYDEL) (#7,RANGEKEYDEL)} +rf-sn:{(#8,RANGEKEYDEL) 
(#7,RANGEKEYDEL) (#4,RANGEKEYDEL)} +sn-sv:{(#10,RANGEKEYDEL) (#8,RANGEKEYDEL) (#7,RANGEKEYDEL) (#4,RANGEKEYDEL)} +sv-wn:{(#10,RANGEKEYDEL) (#4,RANGEKEYDEL)} +wn-yx:{(#4,RANGEKEYDEL)} + + +wn-yx:{(#4,RANGEKEYDEL)} + +# Test that empty spans from child iterators are preserved +define +b-d:{#10,RANGEKEYSET,@1,apples} +e-f:{} +g-h:{#8,RANGEKEYDEL} +-- +a-c:{#3,RANGEKEYUNSET,@1} +h-k:{#5,RANGEKEYDEL} +k-m:{} +---- +2 levels + +iter +first +next +next +next +next +next +next +next +---- +a-b:{(#3,RANGEKEYUNSET,@1)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +c-d:{(#10,RANGEKEYSET,@1,apples)} +e-f:{} +g-h:{(#8,RANGEKEYDEL)} +h-k:{(#5,RANGEKEYDEL)} +k-m:{} + + +iter +last +prev +prev +prev +prev +prev +prev +prev +---- +k-m:{} +h-k:{(#5,RANGEKEYDEL)} +g-h:{(#8,RANGEKEYDEL)} +e-f:{} +c-d:{(#10,RANGEKEYSET,@1,apples)} +b-c:{(#10,RANGEKEYSET,@1,apples) (#3,RANGEKEYUNSET,@1)} +a-b:{(#3,RANGEKEYUNSET,@1)} + diff --git a/pebble/internal/keyspan/testdata/seek b/pebble/internal/keyspan/testdata/seek new file mode 100644 index 0000000..e75a65c --- /dev/null +++ b/pebble/internal/keyspan/testdata/seek @@ -0,0 +1,309 @@ +build +1: b-d +---- +b-d:{(#1,RANGEDEL)} + +seek-ge +a 2 +b 2 +b 1 +d 2 +---- +b-d:{(#1,RANGEDEL)} +b-d:{(#1,RANGEDEL)} +b-d:{} + + +seek-le +a 2 +b 2 +b 1 +d 2 +---- + +b-d:{(#1,RANGEDEL)} +b-d:{} +b-d:{(#1,RANGEDEL)} + +build +3: b-d +2: b-d +1: b-d +---- +b-d:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} + +seek-ge +a 4 +b 4 +b 3 +b 2 +b 1 +d 4 +---- +b-d:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +b-d:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +b-d:{(#2,RANGEDEL) (#1,RANGEDEL)} +b-d:{(#1,RANGEDEL)} +b-d:{} + + +seek-le +a 4 +b 4 +b 3 +b 2 +b 1 +d 4 +---- + +b-d:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +b-d:{(#2,RANGEDEL) (#1,RANGEDEL)} +b-d:{(#1,RANGEDEL)} +b-d:{} +b-d:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} + +build +1: b-d +2: d-f +---- +b-d:{(#1,RANGEDEL)} +d-f:{(#2,RANGEDEL)} + +seek-ge +b 2 +d 2 +d 3 +e 3 +---- +b-d:{(#1,RANGEDEL)} 
+d-f:{} +d-f:{(#2,RANGEDEL)} +d-f:{(#2,RANGEDEL)} + +seek-le +a 3 +b 2 +d 2 +d 3 +e 3 +f 3 +---- + +b-d:{(#1,RANGEDEL)} +d-f:{} +d-f:{(#2,RANGEDEL)} +d-f:{(#2,RANGEDEL)} +d-f:{(#2,RANGEDEL)} + +build +3: a-----------m +2: f------------s +1: j---------------z +---- +a-f:{(#3,RANGEDEL)} +f-j:{(#3,RANGEDEL) (#2,RANGEDEL)} +j-m:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +m-s:{(#2,RANGEDEL) (#1,RANGEDEL)} +s-z:{(#1,RANGEDEL)} + +seek-ge +a 4 +a 3 +a 2 +a 1 +f 4 +f 3 +f 2 +f 1 +j 4 +j 3 +j 2 +j 1 +m 3 +m 2 +m 1 +s 2 +s 1 +z 2 +---- +a-f:{(#3,RANGEDEL)} +a-f:{} +a-f:{} +a-f:{} +f-j:{(#3,RANGEDEL) (#2,RANGEDEL)} +f-j:{(#2,RANGEDEL)} +f-j:{} +f-j:{} +j-m:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#1,RANGEDEL)} +j-m:{} +m-s:{(#2,RANGEDEL) (#1,RANGEDEL)} +m-s:{(#1,RANGEDEL)} +m-s:{} +s-z:{(#1,RANGEDEL)} +s-z:{} + + +seek-le +a 4 +a 3 +a 2 +a 1 +f 4 +f 3 +f 2 +f 1 +j 4 +j 3 +j 2 +j 1 +m 3 +m 2 +m 1 +s 2 +s 1 +z 2 +---- +a-f:{(#3,RANGEDEL)} +a-f:{} +a-f:{} +a-f:{} +f-j:{(#3,RANGEDEL) (#2,RANGEDEL)} +f-j:{(#2,RANGEDEL)} +f-j:{} +f-j:{} +j-m:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#1,RANGEDEL)} +j-m:{} +m-s:{(#2,RANGEDEL) (#1,RANGEDEL)} +m-s:{(#1,RANGEDEL)} +m-s:{} +s-z:{(#1,RANGEDEL)} +s-z:{} +s-z:{(#1,RANGEDEL)} + +build +1: a-----------m +2: f------------s +3: j---------------z +---- +a-f:{(#1,RANGEDEL)} +f-j:{(#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +m-s:{(#3,RANGEDEL) (#2,RANGEDEL)} +s-z:{(#3,RANGEDEL)} + +seek-ge +a 2 +a 1 +f 3 +f 2 +f 1 +j 4 +j 3 +j 2 +j 1 +m 4 +m 3 +m 2 +m 1 +s 4 +s 3 +s 2 +s 1 +z 4 +---- +a-f:{(#1,RANGEDEL)} +a-f:{} +f-j:{(#2,RANGEDEL) (#1,RANGEDEL)} +f-j:{(#1,RANGEDEL)} +f-j:{} +j-m:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#1,RANGEDEL)} +j-m:{} +m-s:{(#3,RANGEDEL) (#2,RANGEDEL)} +m-s:{(#2,RANGEDEL)} +m-s:{} +m-s:{} +s-z:{(#3,RANGEDEL)} +s-z:{} +s-z:{} +s-z:{} + + +seek-le +a 2 +a 1 +f 
3 +f 2 +f 1 +j 4 +j 3 +j 2 +j 1 +m 4 +m 3 +m 2 +m 1 +s 4 +s 3 +s 2 +s 1 +z 4 +z 3 +z 2 +---- +a-f:{(#1,RANGEDEL)} +a-f:{} +f-j:{(#2,RANGEDEL) (#1,RANGEDEL)} +f-j:{(#1,RANGEDEL)} +f-j:{} +j-m:{(#3,RANGEDEL) (#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#2,RANGEDEL) (#1,RANGEDEL)} +j-m:{(#1,RANGEDEL)} +j-m:{} +m-s:{(#3,RANGEDEL) (#2,RANGEDEL)} +m-s:{(#2,RANGEDEL)} +m-s:{} +m-s:{} +s-z:{(#3,RANGEDEL)} +s-z:{} +s-z:{} +s-z:{} +s-z:{(#3,RANGEDEL)} +s-z:{} +s-z:{} + +build +1: a-c +3: a-c +5: a-c +5: c-e +---- +a-c:{(#5,RANGEDEL) (#3,RANGEDEL) (#1,RANGEDEL)} +c-e:{(#5,RANGEDEL)} + +# Regression test for a bug where seek-le was failing to find the most recent +# version of a tombstone. The bug existed when seek-{ge,le} performed snapshot +# filtering, and the problematic case was "seek-le c 4". The seeking code was +# finding the tombstone c-e#5, determining it wasn't visible and then return the +# immediately preceding tombstone a-c#1. Now we return c-e:{} immediately, +# because the span c-e covers c and contains no visible keys. + +seek-le +c 1 +c 2 +c 3 +c 4 +c 5 +c 6 +---- +c-e:{} +c-e:{} +c-e:{} +c-e:{} +c-e:{} +c-e:{(#5,RANGEDEL)} diff --git a/pebble/internal/keyspan/testdata/truncate b/pebble/internal/keyspan/testdata/truncate new file mode 100644 index 0000000..33ab3a5 --- /dev/null +++ b/pebble/internal/keyspan/testdata/truncate @@ -0,0 +1,318 @@ +build +1: b-d +2: d-f +3: f-h +---- +1: b-d +2: d-f +3: f-h + + +truncate a-b +---- + +truncate a-c +---- +1: bc + +truncate a-d +---- +1: b-d + +truncate a-e +---- +1: b-d +2: de + +# The second range tombstone should be elided, as it starts after the +# specified file end key. + +truncate a-e endKey=(d.SET.3) +---- +1: b-d + +# The second range tombstone should be back in the below example, as the +# specified end key has a trailer (RANGEDEL.2) exactly matching that of the +# rangedel tombstone's start key. 
+ +truncate a-e endKey=(d.RANGEDEL.2) +---- +1: b-d +2: de + +truncate a-e endKey=(d.SET.1) +---- +1: b-d +2: de + +# Similarly, truncate range tombstones that end before the start key. + +truncate a-e startKey=(d.SET.3) +---- +2: de + +truncate a-e startKey=(c.SET.3) +---- +1: b-d +2: de + +truncate a-f +---- +1: b-d +2: d-f + +truncate a-g +---- +1: b-d +2: d-f +3: fg + +truncate a-h +---- +1: b-d +2: d-f +3: f-h + + +truncate b-b +---- + +truncate b-c +---- +1: bc + +truncate b-d +---- +1: b-d + +truncate b-e +---- +1: b-d +2: de + +truncate b-f +---- +1: b-d +2: d-f + +truncate b-g +---- +1: b-d +2: d-f +3: fg + +truncate b-h +---- +1: b-d +2: d-f +3: f-h + + +truncate c-c +---- + +truncate c-d +---- +1: cd + +truncate c-e +---- +1: cd +2: de + +truncate c-f +---- +1: cd +2: d-f + +truncate c-g +---- +1: cd +2: d-f +3: fg + +truncate c-h +---- +1: cd +2: d-f +3: f-h + + +truncate d-d +---- + +truncate d-e +---- +2: de + +truncate d-f +---- +2: d-f + +truncate d-g +---- +2: d-f +3: fg + +truncate d-h +---- +2: d-f +3: f-h + + +truncate e-e +---- + +truncate e-f +---- +2: ef + +truncate e-g +---- +2: ef +3: fg + +truncate e-h +---- +2: ef +3: f-h + + +truncate f-f +---- + +truncate f-g +---- +3: fg + +truncate f-h +---- +3: f-h + + +truncate g-g +---- + +truncate g-h +---- +3: gh + +# Regression test for https://github.com/cockroachdb/cockroach/issues/113973. 
+ +truncate-and-save-iter a-dd +---- +ok + +saved-iter +first +next +next +next +---- +b-d:{(#1,RANGEDEL)} +d-dd:{(#2,RANGEDEL)} + + + +saved-iter +seek-ge e +next +next +---- + + + + +saved-iter +seek-ge e +prev +prev +---- + +d-dd:{(#2,RANGEDEL)} +b-d:{(#1,RANGEDEL)} + +saved-iter +seek-lt e +prev +prev +---- +d-dd:{(#2,RANGEDEL)} +b-d:{(#1,RANGEDEL)} + + +saved-iter +seek-lt e +next +next +---- +d-dd:{(#2,RANGEDEL)} + + + +truncate-and-save-iter ee-h +---- +ok + +saved-iter +first +next +next +next +---- +ee-f:{(#2,RANGEDEL)} +f-h:{(#3,RANGEDEL)} + + + +saved-iter +seek-ge e +next +next +---- +ee-f:{(#2,RANGEDEL)} +f-h:{(#3,RANGEDEL)} + + +saved-iter +seek-ge e +prev +prev +---- +ee-f:{(#2,RANGEDEL)} + + + +saved-iter +seek-lt e +prev +prev +---- + + + + +saved-iter +seek-lt e +next +next +---- + +ee-f:{(#2,RANGEDEL)} +f-h:{(#3,RANGEDEL)} + + +truncate-and-save-iter a-g +---- +ok + +saved-iter +seek-ge h +prev +seek-lt h +next +---- + +f-g:{(#3,RANGEDEL)} +f-g:{(#3,RANGEDEL)} + diff --git a/pebble/internal/keyspan/testdata/visible b/pebble/internal/keyspan/testdata/visible new file mode 100644 index 0000000..6a3b14b --- /dev/null +++ b/pebble/internal/keyspan/testdata/visible @@ -0,0 +1,58 @@ +define +a-b:{(#5,RANGEKEYSET) (#3,RANGEKEYSET)} +---- +a-b:{(#5,RANGEKEYSET) (#3,RANGEKEYSET)} + +visible +6 +5 +4 +3 +2 +1 +---- +6 : a-b:{(#5,RANGEKEYSET) (#3,RANGEKEYSET)} +5 : a-b:{(#3,RANGEKEYSET)} +4 : a-b:{(#3,RANGEKEYSET)} +3 : a-b:{} +2 : a-b:{} +1 : a-b:{} + +define +a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET)} +---- +a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET)} + +visible +5 +1 +---- +5 : a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET)} +1 : a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET)} + +define +a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#10,RANGEKEYSET) (#9,RANGEKEYSET) (#4,RANGEKEYSET) (#1,RANGEKEYSET)} +---- 
+a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#10,RANGEKEYSET) (#9,RANGEKEYSET) (#4,RANGEKEYSET) (#1,RANGEKEYSET)} + +# Test 'sandwich cases'. Eg, at snapshot=7 the keys at #10 and #9 are invisible, +# but the batch keys and the keys at #4 and #1 are visible. + +visible +12 +10 +8 +7 +4 +3 +2 +1 +---- +12: a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#10,RANGEKEYSET) (#9,RANGEKEYSET) (#4,RANGEKEYSET) (#1,RANGEKEYSET)} +10: a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#9,RANGEKEYSET) (#4,RANGEKEYSET) (#1,RANGEKEYSET)} +8 : a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#4,RANGEKEYSET) (#1,RANGEKEYSET)} +7 : a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#4,RANGEKEYSET) (#1,RANGEKEYSET)} +4 : a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#1,RANGEKEYSET)} +3 : a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#1,RANGEKEYSET)} +2 : a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#1,RANGEKEYSET)} +1 : a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET)} diff --git a/pebble/internal/keyspan/testdata/visible_at b/pebble/internal/keyspan/testdata/visible_at new file mode 100644 index 0000000..6c8d56b --- /dev/null +++ b/pebble/internal/keyspan/testdata/visible_at @@ -0,0 +1,58 @@ +define +a-b:{(#5,RANGEKEYSET) (#3,RANGEKEYSET)} +---- +a-b:{(#5,RANGEKEYSET) (#3,RANGEKEYSET)} + +visible-at +6 +5 +4 +3 +2 +1 +---- +6 : true +5 : true +4 : true +3 : false +2 : false +1 : false + +# NB: #36028797018963996 and #36028797018963995 are sequence numbers with the +# batch bit set. These keys should always be visible. 
+ +define +a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET)} +---- +a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET)} + +visible-at +5 +1 +---- +5 : true +1 : true + +define +a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#10,RANGEKEYSET) (#9,RANGEKEYSET) (#4,RANGEKEYSET) (#1,RANGEKEYSET)} +---- +a-c:{(#36028797018963996,RANGEKEYSET) (#36028797018963995,RANGEKEYSET) (#10,RANGEKEYSET) (#9,RANGEKEYSET) (#4,RANGEKEYSET) (#1,RANGEKEYSET)} + +visible-at +12 +10 +8 +7 +4 +3 +2 +1 +---- +12: true +10: true +8 : true +7 : true +4 : true +3 : true +2 : true +1 : true diff --git a/pebble/internal/keyspan/transformer.go b/pebble/internal/keyspan/transformer.go new file mode 100644 index 0000000..b5e8735 --- /dev/null +++ b/pebble/internal/keyspan/transformer.go @@ -0,0 +1,50 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import "github.com/cockroachdb/pebble/internal/base" + +// Transformer defines a transformation to be applied to a Span. +type Transformer interface { + // Transform takes a Span as input and writes the transformed Span to the + // provided output *Span pointer. The output Span's Keys slice may be reused + // by Transform to reduce allocations. + Transform(cmp base.Compare, in Span, out *Span) error +} + +// The TransformerFunc type is an adapter to allow the use of ordinary functions +// as Transformers. If f is a function with the appropriate signature, +// TransformerFunc(f) is a Transformer that calls f. +type TransformerFunc func(base.Compare, Span, *Span) error + +// Transform calls f(cmp, in, out). 
+func (tf TransformerFunc) Transform(cmp base.Compare, in Span, out *Span) error { + return tf(cmp, in, out) +} + +var noopTransform Transformer = TransformerFunc(func(_ base.Compare, s Span, dst *Span) error { + dst.Start, dst.End = s.Start, s.End + dst.Keys = append(dst.Keys[:0], s.Keys...) + return nil +}) + +// VisibleTransform filters keys that are invisible at the provided snapshot +// sequence number. +func VisibleTransform(snapshot uint64) Transformer { + return TransformerFunc(func(_ base.Compare, s Span, dst *Span) error { + dst.Start, dst.End = s.Start, s.End + dst.Keys = dst.Keys[:0] + for _, k := range s.Keys { + // NB: The InternalKeySeqNumMax value is used for the batch snapshot + // because a batch's visible span keys are filtered when they're + // fragmented. There's no requirement to enforce visibility at + // iteration time. + if base.Visible(k.SeqNum(), snapshot, base.InternalKeySeqNumMax) { + dst.Keys = append(dst.Keys, k) + } + } + return nil + }) +} diff --git a/pebble/internal/keyspan/truncate.go b/pebble/internal/keyspan/truncate.go new file mode 100644 index 0000000..c0e609b --- /dev/null +++ b/pebble/internal/keyspan/truncate.go @@ -0,0 +1,73 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import "github.com/cockroachdb/pebble/internal/base" + +// Truncate creates a new iterator where every span in the supplied iterator is +// truncated to be contained within the range [lower, upper). If start and end +// are specified, filter out any spans that are completely outside those bounds. 
+func Truncate( + cmp base.Compare, + iter FragmentIterator, + lower, upper []byte, + start, end *base.InternalKey, + panicOnUpperTruncate bool, +) FragmentIterator { + return Filter(iter, func(in *Span, out *Span) (keep bool) { + out.Start, out.End = in.Start, in.End + out.Keys = append(out.Keys[:0], in.Keys...) + + // Ignore this span if it lies completely outside start, end. Note that + // end endInclusive indicated whether end is inclusive. + // + // The comparison between s.End and start is by user key only, as + // the span is exclusive at s.End, so comparing by user keys + // is sufficient. + if start != nil && cmp(in.End, start.UserKey) <= 0 { + return false + } + if end != nil { + v := cmp(in.Start, end.UserKey) + switch { + case v > 0: + // Wholly outside the end bound. Skip it. + return false + case v == 0: + // This span begins at the same user key as `end`. Whether or + // not any of the keys contained within the span are relevant is + // dependent on Trailers. Any keys contained within the span + // with trailers larger than end cover the small sliver of + // keyspace between [k#inf, k#]. Since keys are + // sorted descending by Trailer within the span, we need to find + // the prefix of keys with larger trailers. + for i := range in.Keys { + if in.Keys[i].Trailer < end.Trailer { + out.Keys = out.Keys[:i] + break + } + } + default: + // Wholly within the end bound. Keep it. + } + } + + var truncated bool + // Truncate the bounds to lower and upper. 
+ if cmp(in.Start, lower) < 0 { + out.Start = lower + } + if cmp(in.End, upper) > 0 { + truncated = true + out.End = upper + } + + if panicOnUpperTruncate && truncated { + panic("pebble: upper bound should not be truncated") + } + + return !out.Empty() && cmp(out.Start, out.End) < 0 + }, cmp) +} diff --git a/pebble/internal/keyspan/truncate_test.go b/pebble/internal/keyspan/truncate_test.go new file mode 100644 index 0000000..f2b2793 --- /dev/null +++ b/pebble/internal/keyspan/truncate_test.go @@ -0,0 +1,94 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "bytes" + "fmt" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" +) + +func TestTruncate(t *testing.T) { + cmp := base.DefaultComparer.Compare + fmtKey := base.DefaultComparer.FormatKey + var iter FragmentIterator + var savedIter FragmentIterator + defer func() { + if savedIter != nil { + savedIter.Close() + savedIter = nil + } + }() + + datadriven.RunTest(t, "testdata/truncate", func(t *testing.T, d *datadriven.TestData) string { + doTruncate := func() FragmentIterator { + if len(d.Input) > 0 { + t.Fatalf("unexpected input: %s", d.Input) + } + if len(d.CmdArgs) < 1 || len(d.CmdArgs) > 3 { + t.Fatalf("expected 1-3 arguments: %s", d.CmdArgs) + } + parts := strings.Split(d.CmdArgs[0].String(), "-") + var startKey, endKey *base.InternalKey + if len(d.CmdArgs) > 1 { + for _, arg := range d.CmdArgs[1:] { + switch arg.Key { + case "startKey": + startKey = &base.InternalKey{} + *startKey = base.ParseInternalKey(arg.Vals[0]) + case "endKey": + endKey = &base.InternalKey{} + *endKey = base.ParseInternalKey(arg.Vals[0]) + } + } + } + if len(parts) != 2 { + t.Fatalf("malformed arg: %s", d.CmdArgs[0]) + } + lower := []byte(parts[0]) + upper := []byte(parts[1]) + + tIter := Truncate( + cmp, iter, 
lower, upper, startKey, endKey, false, + ) + return tIter + } + + switch d.Cmd { + case "build": + tombstones := buildSpans(t, cmp, fmtKey, d.Input, base.InternalKeyKindRangeDelete) + iter = NewIter(cmp, tombstones) + return formatAlphabeticSpans(tombstones) + + case "truncate": + tIter := doTruncate() + defer tIter.Close() + var truncated []Span + for s := tIter.First(); s != nil; s = tIter.Next() { + truncated = append(truncated, s.ShallowClone()) + } + return formatAlphabeticSpans(truncated) + + case "truncate-and-save-iter": + if savedIter != nil { + savedIter.Close() + } + savedIter = doTruncate() + return "ok" + + case "saved-iter": + var buf bytes.Buffer + runIterCmd(t, d, savedIter, &buf) + return buf.String() + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} diff --git a/pebble/internal/lint/lint.go b/pebble/internal/lint/lint.go new file mode 100644 index 0000000..338a34a --- /dev/null +++ b/pebble/internal/lint/lint.go @@ -0,0 +1,5 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package lint diff --git a/pebble/internal/lint/lint_test.go b/pebble/internal/lint/lint_test.go new file mode 100644 index 0000000..e088d69 --- /dev/null +++ b/pebble/internal/lint/lint_test.go @@ -0,0 +1,301 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package lint + +import ( + "bytes" + "fmt" + "go/build" + "os/exec" + "regexp" + "runtime" + "strings" + "testing" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/ghemawat/stream" + "github.com/stretchr/testify/require" +) + +const ( + cmdGo = "go" + golint = "golang.org/x/lint/golint@6edffad5e6160f5949cdefc81710b2706fbcd4f6" + staticcheck = "honnef.co/go/tools/cmd/staticcheck@2023.1" + crlfmt = "github.com/cockroachdb/crlfmt@44a36ec7" +) + +func dirCmd(t *testing.T, dir string, name string, args ...string) stream.Filter { + cmd := exec.Command(name, args...) + cmd.Dir = dir + out, err := cmd.CombinedOutput() + switch err.(type) { + case nil: + case *exec.ExitError: + // Non-zero exit is expected. + default: + require.NoError(t, err) + } + return stream.ReadLines(bytes.NewReader(out)) +} + +func ignoreGoMod() stream.Filter { + return stream.GrepNot(`^go: (finding|extracting|downloading)`) +} + +func TestLint(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("lint checks skipped on Windows") + } + if invariants.RaceEnabled { + // We are not interested in race-testing the linters themselves. + t.Skip("lint checks skipped on race builds") + } + + const root = "github.com/cockroachdb/pebble" + + pkg, err := build.Import(root, "../..", 0) + require.NoError(t, err) + + var pkgs []string + if err := stream.ForEach( + stream.Sequence( + dirCmd(t, pkg.Dir, "go", "list", "./..."), + ignoreGoMod(), + ), func(s string) { + pkgs = append(pkgs, s) + }); err != nil { + require.NoError(t, err) + } + + t.Run("TestGolint", func(t *testing.T) { + t.Parallel() + + args := []string{"run", golint} + args = append(args, pkgs...) + + // This is overkill right now, but provides a structure for filtering out + // lint errors we don't care about. 
+ if err := stream.ForEach( + stream.Sequence( + dirCmd(t, pkg.Dir, cmdGo, args...), + stream.GrepNot("go: downloading"), + ), func(s string) { + t.Errorf("\n%s", s) + }); err != nil { + t.Error(err) + } + }) + + t.Run("TestStaticcheck", func(t *testing.T) { + t.Parallel() + + args := []string{"run", staticcheck} + args = append(args, pkgs...) + + if err := stream.ForEach( + stream.Sequence( + dirCmd(t, pkg.Dir, cmdGo, args...), + stream.GrepNot("go: downloading"), + ), func(s string) { + t.Errorf("\n%s", s) + }); err != nil { + t.Error(err) + } + }) + + t.Run("TestGoVet", func(t *testing.T) { + t.Parallel() + + if err := stream.ForEach( + stream.Sequence( + dirCmd(t, pkg.Dir, "go", "vet", "-all", "./..."), + stream.GrepNot(`^#`), // ignore comment lines + ignoreGoMod(), + ), func(s string) { + t.Errorf("\n%s", s) + }); err != nil { + t.Error(err) + } + }) + + t.Run("TestFmtErrorf", func(t *testing.T) { + t.Parallel() + + if err := stream.ForEach( + dirCmd(t, pkg.Dir, "git", "grep", "fmt\\.Errorf("), + func(s string) { + t.Errorf("\n%s <- please use \"errors.Errorf\" instead", s) + }); err != nil { + t.Error(err) + } + }) + + t.Run("TestOSIsErr", func(t *testing.T) { + t.Parallel() + + if err := stream.ForEach( + dirCmd(t, pkg.Dir, "git", "grep", "os\\.Is"), + func(s string) { + t.Errorf("\n%s <- please use the \"oserror\" equivalent instead", s) + }); err != nil { + t.Error(err) + } + }) + + t.Run("TestSetFinalizer", func(t *testing.T) { + t.Parallel() + + if err := stream.ForEach( + stream.Sequence( + dirCmd(t, pkg.Dir, "git", "grep", "-B1", "runtime\\.SetFinalizer("), + lintIgnore("lint:ignore SetFinalizer"), + stream.GrepNot(`^internal/invariants/finalizer_on.go`), + ), func(s string) { + t.Errorf("\n%s <- please use the \"invariants.SetFinalizer\" equivalent instead", s) + }); err != nil { + t.Error(err) + } + }) + + // Disallow "raw" atomics; wrappers like atomic.Int32 provide much better + // safety and alignment guarantees. 
+ t.Run("TestRawAtomics", func(t *testing.T) { + t.Parallel() + if err := stream.ForEach( + stream.Sequence( + dirCmd(t, pkg.Dir, "git", "grep", `atomic\.\(Load\|Store\|Add\|Swap\|Compare\)`), + lintIgnore("lint:ignore RawAtomics"), + ), func(s string) { + t.Errorf("\n%s <- please use atomic wrappers (like atomic.Int32) instead", s) + }); err != nil { + t.Error(err) + } + }) + + t.Run("TestForbiddenImports", func(t *testing.T) { + t.Parallel() + + // Forbidden-import-pkg -> permitted-replacement-pkg + forbiddenImports := map[string]string{ + "errors": "github.com/cockroachdb/errors", + "pkg/errors": "github.com/cockroachdb/errors", + } + + // grepBuf creates a grep string that matches any forbidden import pkgs. + var grepBuf bytes.Buffer + grepBuf.WriteByte('(') + for forbiddenPkg := range forbiddenImports { + grepBuf.WriteByte('|') + grepBuf.WriteString(regexp.QuoteMeta(forbiddenPkg)) + } + grepBuf.WriteString(")$") + + filter := stream.FilterFunc(func(arg stream.Arg) error { + for _, path := range pkgs { + buildContext := build.Default + buildContext.UseAllFiles = true + importPkg, err := buildContext.Import(path, pkg.Dir, 0) + if _, ok := err.(*build.MultiplePackageError); ok { + buildContext.UseAllFiles = false + importPkg, err = buildContext.Import(path, pkg.Dir, 0) + } + + switch err.(type) { + case nil: + for _, s := range importPkg.Imports { + arg.Out <- importPkg.ImportPath + ": " + s + } + for _, s := range importPkg.TestImports { + arg.Out <- importPkg.ImportPath + ": " + s + } + for _, s := range importPkg.XTestImports { + arg.Out <- importPkg.ImportPath + ": " + s + } + case *build.NoGoError: + default: + return errors.Wrapf(err, "error loading package %s", path) + } + } + return nil + }) + if err := stream.ForEach(stream.Sequence( + filter, + stream.Sort(), + stream.Uniq(), + stream.Grep(grepBuf.String()), + ), func(s string) { + pkgStr := strings.Split(s, ": ") + importedPkg := pkgStr[1] + + // Test that a disallowed package is not imported. 
+ if replPkg, ok := forbiddenImports[importedPkg]; ok { + t.Errorf("\n%s <- please use %q instead of %q", s, replPkg, importedPkg) + } + }); err != nil { + t.Error(err) + } + }) + + t.Run("TestCrlfmt", func(t *testing.T) { + t.Parallel() + + args := []string{"run", crlfmt, "-fast", "-tab", "2", "."} + var buf bytes.Buffer + if err := stream.ForEach( + stream.Sequence( + dirCmd(t, pkg.Dir, cmdGo, args...), + stream.GrepNot("go: downloading"), + ), + func(s string) { + fmt.Fprintln(&buf, s) + }); err != nil { + t.Error(err) + } + errs := buf.String() + if len(errs) > 0 { + t.Errorf("\n%s", errs) + } + + if t.Failed() { + reWriteCmd := []string{crlfmt, "-w"} + reWriteCmd = append(reWriteCmd, args...) + t.Logf("run the following to fix your formatting:\n"+ + "\n%s\n\n"+ + "Don't forget to add amend the result to the correct commits.", + strings.Join(reWriteCmd, " "), + ) + } + }) +} + +// lintIgnore is a stream.FilterFunc that filters out lines that are preceded by +// the given ignore directive. The function assumes the input stream receives a +// sequence of strings that are to be considered as pairs. If the first string +// in the sequence matches the ignore directive, the following string is +// dropped, else it is emitted. +// +// For example, given the sequence "foo", "bar", "baz", "bam", and an ignore +// directive "foo", the sequence "baz", "bam" would be emitted. If the directive +// was "baz", the sequence "foo", "bar" would be emitted. +func lintIgnore(ignore string) stream.FilterFunc { + return func(arg stream.Arg) error { + var prev string + var i int + for s := range arg.In { + if i%2 == 0 { + // Fist string in the pair is used as the filter. Store it. + prev = s + } else { + // Second string is emitted only if it _does not_ match the directive. 
+ if !strings.Contains(prev, ignore) { + arg.Out <- s + } + } + i++ + } + return nil + } +} diff --git a/pebble/internal/manifest/btree.go b/pebble/internal/manifest/btree.go new file mode 100644 index 0000000..dd17834 --- /dev/null +++ b/pebble/internal/manifest/btree.go @@ -0,0 +1,1304 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + "fmt" + "strings" + "sync/atomic" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/invariants" + stdcmp "github.com/cockroachdb/pebble/shims/cmp" +) + +// The Annotator type defined below is used by other packages to lazily +// compute a value over a B-Tree. Each node of the B-Tree stores one +// `annotation` per annotator, containing the result of the computation over +// the node's subtree. +// +// An annotation is marked as valid if it's current with the current subtree +// state. Annotations are marked as invalid whenever a node will be mutated +// (in mut). Annotators may also return `false` from `Accumulate` to signal +// that a computation for a file is not stable and may change in the future. +// Annotations that include these unstable values are also marked as invalid +// on the node, ensuring that future queries for the annotation will recompute +// the value. + +// An Annotator defines a computation over a level's FileMetadata. If the +// computation is stable and uses inputs that are fixed for the lifetime of +// a FileMetadata, the LevelMetadata's internal data structures are annotated +// with the intermediary computations. This allows the computation to be +// computed incrementally as edits are applied to a level. +type Annotator interface { + // Zero returns the zero value of an annotation. This value is returned + // when a LevelMetadata is empty. 
The dst argument, if non-nil, is an + // obsolete value previously returned by this Annotator and may be + // overwritten and reused to avoid a memory allocation. + Zero(dst interface{}) (v interface{}) + + // Accumulate computes the annotation for a single file in a level's + // metadata. It merges the file's value into dst and returns a bool flag + // indicating whether or not the value is stable and okay to cache as an + // annotation. If the file's value may change over the life of the file, + // the annotator must return false. + // + // Implementations may modify dst and return it to avoid an allocation. + Accumulate(m *FileMetadata, dst interface{}) (v interface{}, cacheOK bool) + + // Merge combines two values src and dst, returning the result. + // Implementations may modify dst and return it to avoid an allocation. + Merge(src interface{}, dst interface{}) interface{} +} + +type btreeCmp func(*FileMetadata, *FileMetadata) int + +func btreeCmpSeqNum(a, b *FileMetadata) int { + return a.cmpSeqNum(b) +} + +func btreeCmpSmallestKey(cmp Compare) btreeCmp { + return func(a, b *FileMetadata) int { + return a.cmpSmallestKey(b, cmp) + } +} + +// btreeCmpSpecificOrder is used in tests to construct a B-Tree with a +// specific ordering of FileMetadata within the tree. It's typically used to +// test consistency checking code that needs to construct a malformed B-Tree. +func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp { + m := map[*FileMetadata]int{} + for i, f := range files { + m[f] = i + } + return func(a, b *FileMetadata) int { + ai, aok := m[a] + bi, bok := m[b] + if !aok || !bok { + panic("btreeCmpSliceOrder called with unknown files") + } + return stdcmp.Compare(ai, bi) + } +} + +const ( + degree = 16 + maxItems = 2*degree - 1 + minItems = degree - 1 +) + +type annotation struct { + annotator Annotator + // v is an annotation value, the output of either + // annotator.Value or annotator.Merge. 
+ v interface{} + // valid indicates whether future reads of the annotation may use v as-is. + // If false, v will be zeroed and recalculated. + valid bool +} + +type leafNode struct { + ref atomic.Int32 + count int16 + leaf bool + // subtreeCount holds the count of files in the entire subtree formed by + // this node. For leaf nodes, subtreeCount is always equal to count. For + // non-leaf nodes, it's the sum of count plus all the children's + // subtreeCounts. + // + // NB: We could move this field to the end of the node struct, since leaf => + // count=subtreeCount, however the unsafe casting [leafToNode] performs make + // it risky and cumbersome. + subtreeCount int + items [maxItems]*FileMetadata + // annot contains one annotation per annotator, merged over the entire + // node's files (and all descendants for non-leaf nodes). + annot []annotation +} + +type node struct { + leafNode + children [maxItems + 1]*node +} + +//go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct. +func leafToNode(ln *leafNode) *node { + return (*node)(unsafe.Pointer(ln)) +} + +func newLeafNode() *node { + n := leafToNode(new(leafNode)) + n.leaf = true + n.ref.Store(1) + return n +} + +func newNode() *node { + n := new(node) + n.ref.Store(1) + return n +} + +// mut creates and returns a mutable node reference. If the node is not shared +// with any other trees then it can be modified in place. Otherwise, it must be +// cloned to ensure unique ownership. In this way, we enforce a copy-on-write +// policy which transparently incorporates the idea of local mutations, like +// Clojure's transients or Haskell's ST monad, where nodes are only copied +// during the first time that they are modified between Clone operations. +// +// When a node is cloned, the provided pointer will be redirected to the new +// mutable node. +func mut(n **node) *node { + if (*n).ref.Load() == 1 { + // Exclusive ownership. Can mutate in place. 
+ + // Whenever a node will be mutated, reset its annotations to be marked + // as uncached. This ensures any future calls to (*node).annotation + // will recompute annotations on the modified subtree. + for i := range (*n).annot { + (*n).annot[i].valid = false + } + return *n + } + // If we do not have unique ownership over the node then we + // clone it to gain unique ownership. After doing so, we can + // release our reference to the old node. We pass recursive + // as true because even though we just observed the node's + // reference count to be greater than 1, we might be racing + // with another call to decRef on this node. + c := (*n).clone() + (*n).decRef(true /* contentsToo */, nil) + *n = c + // NB: We don't need to clear annotations, because (*node).clone does not + // copy them. + return *n +} + +// incRef acquires a reference to the node. +func (n *node) incRef() { + n.ref.Add(1) +} + +// decRef releases a reference to the node. If requested, the method will unref +// its items and recurse into child nodes and decrease their refcounts as well. +// Some internal codepaths that manually copy the node's items or children to +// new nodes pass contentsToo=false to preserve existing reference counts during +// operations that should yield a net-zero change to descendant refcounts. +// When a node is released, its contained files are dereferenced. +func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) { + if n.ref.Add(-1) > 0 { + // Other references remain. Can't free. + return + } + + // Dereference the node's metadata and release child references if + // requested. Some internal callers may not want to propagate the deref + // because they're manually copying the filemetadata and children to other + // nodes, and they want to preserve the existing reference count. + if contentsToo { + for _, f := range n.items[:n.count] { + if f.Unref() == 0 { + // There are two sources of node dereferences: tree mutations + // and Version dereferences. 
Files should only be made obsolete + // during Version dereferences, during which `obsolete` will be + // non-nil. + if obsolete == nil { + panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum)) + } + // Reference counting is performed on the FileBacking. In the case + // of a virtual sstable, this reference counting is performed on + // a FileBacking which is shared by every single virtual sstable + // with the same backing sstable. If the reference count hits 0, + // then we know that the FileBacking won't be required by any + // sstable in Pebble, and that the backing sstable can be deleted. + *obsolete = append(*obsolete, f.FileBacking) + } + } + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + n.children[i].decRef(true /* contentsToo */, obsolete) + } + } + } +} + +// clone creates a clone of the receiver with a single reference count. +func (n *node) clone() *node { + var c *node + if n.leaf { + c = newLeafNode() + } else { + c = newNode() + } + // NB: copy field-by-field without touching n.ref to avoid + // triggering the race detector and looking like a data race. + c.count = n.count + c.items = n.items + c.subtreeCount = n.subtreeCount + // Increase the refcount of each contained item. + for _, f := range n.items[:n.count] { + f.Ref() + } + if !c.leaf { + // Copy children and increase each refcount. + c.children = n.children + for i := int16(0); i <= c.count; i++ { + c.children[i].incRef() + } + } + return c +} + +// insertAt inserts the provided file and node at the provided index. This +// function is for use only as a helper function for internal B-Tree code. +// Clients should not invoke it directly. 
+func (n *node) insertAt(index int, item *FileMetadata, nd *node) { + if index < int(n.count) { + copy(n.items[index+1:n.count+1], n.items[index:n.count]) + if !n.leaf { + copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1]) + } + } + n.items[index] = item + if !n.leaf { + n.children[index+1] = nd + } + n.count++ +} + +// pushBack inserts the provided file and node at the tail of the node's items. +// This function is for use only as a helper function for internal B-Tree code. +// Clients should not invoke it directly. +func (n *node) pushBack(item *FileMetadata, nd *node) { + n.items[n.count] = item + if !n.leaf { + n.children[n.count+1] = nd + } + n.count++ +} + +// pushFront inserts the provided file and node at the head of the +// node's items. This function is for use only as a helper function for internal B-Tree +// code. Clients should not invoke it directly. +func (n *node) pushFront(item *FileMetadata, nd *node) { + if !n.leaf { + copy(n.children[1:n.count+2], n.children[:n.count+1]) + n.children[0] = nd + } + copy(n.items[1:n.count+1], n.items[:n.count]) + n.items[0] = item + n.count++ +} + +// removeAt removes a value at a given index, pulling all subsequent values +// back. This function is for use only as a helper function for internal B-Tree +// code. Clients should not invoke it directly. +func (n *node) removeAt(index int) (*FileMetadata, *node) { + var child *node + if !n.leaf { + child = n.children[index+1] + copy(n.children[index+1:n.count], n.children[index+2:n.count+1]) + n.children[n.count] = nil + } + n.count-- + out := n.items[index] + copy(n.items[index:n.count], n.items[index+1:n.count+1]) + n.items[n.count] = nil + return out, child +} + +// popBack removes and returns the last element in the list. This function is +// for use only as a helper function for internal B-Tree code. Clients should +// not invoke it directly. 
+func (n *node) popBack() (*FileMetadata, *node) { + n.count-- + out := n.items[n.count] + n.items[n.count] = nil + if n.leaf { + return out, nil + } + child := n.children[n.count+1] + n.children[n.count+1] = nil + return out, child +} + +// popFront removes and returns the first element in the list. This function is +// for use only as a helper function for internal B-Tree code. Clients should +// not invoke it directly. +func (n *node) popFront() (*FileMetadata, *node) { + n.count-- + var child *node + if !n.leaf { + child = n.children[0] + copy(n.children[:n.count+1], n.children[1:n.count+2]) + n.children[n.count+1] = nil + } + out := n.items[0] + copy(n.items[:n.count], n.items[1:n.count+1]) + n.items[n.count] = nil + return out, child +} + +// find returns the index where the given item should be inserted into this +// list. 'found' is true if the item already exists in the list at the given +// index. +// +// This function is for use only as a helper function for internal B-Tree code. +// Clients should not invoke it directly. +func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) { + // Logic copied from sort.Search. Inlining this gave + // an 11% speedup on BenchmarkBTreeDeleteInsert. + i, j := 0, int(n.count) + for i < j { + h := int(uint(i+j) >> 1) // avoid overflow when computing h + // i ≤ h < j + v := cmp(item, n.items[h]) + if v == 0 { + return h, true + } else if v > 0 { + i = h + 1 + } else { + j = h + } + } + return i, false +} + +// split splits the given node at the given index. The current node shrinks, +// and this function returns the item that existed at that index and a new +// node containing all items/children after it. +// +// split is called when we want to perform a transformation like the one +// depicted in the following diagram. 
+// +// Before: +// +-----------+ +// n *node | x y z | +// +--/-/-\-\--+ +// +// After: +// +-----------+ +// | y | n's parent +// +----/-\----+ +// / \ +// v v +// +-----------+ +-----------+ +// n *node | x | | z | next *node +// +-----------+ +-----------+ +// +// split does not perform the complete transformation; the caller is responsible +// for updating the parent appropriately. split splits `n` into two nodes, `n` +// and `next`, returning `next` and the file that separates them. In the diagram +// above, `n.split` removes y and z from `n`, returning y in the first return +// value and `next` in the second return value. The caller is responsible for +// updating n's parent to now contain `y` as the separator between nodes `n` and +// `next`. +// +// This function is for use only as a helper function for internal B-Tree code. +// Clients should not invoke it directly. +func (n *node) split(i int) (*FileMetadata, *node) { + out := n.items[i] + var next *node + if n.leaf { + next = newLeafNode() + } else { + next = newNode() + } + next.count = n.count - int16(i+1) + copy(next.items[:], n.items[i+1:n.count]) + for j := int16(i); j < n.count; j++ { + n.items[j] = nil + } + if !n.leaf { + copy(next.children[:], n.children[i+1:n.count+1]) + descendantsMoved := 0 + for j := int16(i + 1); j <= n.count; j++ { + descendantsMoved += n.children[j].subtreeCount + n.children[j] = nil + } + n.subtreeCount -= descendantsMoved + next.subtreeCount += descendantsMoved + } + n.count = int16(i) + // NB: We subtract one more than `next.count` from n's subtreeCount because + // the item at index `i` was removed from `n.items`. We'll return the item + // at index `i`, and the caller is responsible for updating the subtree + // count of whichever node adopts it. 
+ n.subtreeCount -= int(next.count) + 1 + next.subtreeCount += int(next.count) + return out, next +} + +// Insert inserts a item into the subtree rooted at this node, making sure no +// nodes in the subtree exceed maxItems items. +func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { + i, found := n.find(cmp, item) + if found { + // cmp provides a total ordering of the files within a level. + // If we're inserting a metadata that's equal to an existing item + // in the tree, we're inserting a file into a level twice. + return errors.Errorf("files %s and %s collided on sort keys", + errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum)) + } + if n.leaf { + n.insertAt(i, item, nil) + n.subtreeCount++ + return nil + } + if n.children[i].count >= maxItems { + splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2) + n.insertAt(i, splitLa, splitNode) + + switch cmp := cmp(item, n.items[i]); { + case cmp < 0: + // no change, we want first split node + case cmp > 0: + i++ // we want second split node + default: + // cmp provides a total ordering of the files within a level. + // If we're inserting a metadata that's equal to an existing item + // in the tree, we're inserting a file into a level twice. + return errors.Errorf("files %s and %s collided on sort keys", + errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum)) + } + } + + err := mut(&n.children[i]).Insert(cmp, item) + if err == nil { + n.subtreeCount++ + } + return err +} + +// removeMax removes and returns the maximum item from the subtree rooted at +// this node. This function is for use only as a helper function for internal +// B-Tree code. Clients should not invoke it directly. 
+func (n *node) removeMax() *FileMetadata { + if n.leaf { + n.count-- + n.subtreeCount-- + out := n.items[n.count] + n.items[n.count] = nil + return out + } + child := mut(&n.children[n.count]) + if child.count <= minItems { + n.rebalanceOrMerge(int(n.count)) + return n.removeMax() + } + n.subtreeCount-- + return child.removeMax() +} + +// Remove removes a item from the subtree rooted at this node. Returns +// the item that was removed or nil if no matching item was found. +func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) { + i, found := n.find(cmp, item) + if n.leaf { + if found { + out, _ = n.removeAt(i) + n.subtreeCount-- + return out + } + return nil + } + if n.children[i].count <= minItems { + // Child not large enough to remove from. + n.rebalanceOrMerge(i) + return n.Remove(cmp, item) + } + child := mut(&n.children[i]) + if found { + // Replace the item being removed with the max item in our left child. + out = n.items[i] + n.items[i] = child.removeMax() + n.subtreeCount-- + return out + } + // File is not in this node and child is large enough to remove from. + out = child.Remove(cmp, item) + if out != nil { + n.subtreeCount-- + } + return out +} + +// rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a +// item from it while keeping it at or above minItems. This function is for use +// only as a helper function for internal B-Tree code. Clients should not invoke +// it directly. +func (n *node) rebalanceOrMerge(i int) { + switch { + case i > 0 && n.children[i-1].count > minItems: + // Rebalance from left sibling. 
+ // + // +-----------+ + // | y | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | x | | | + // +----------\+ +-----------+ + // \ + // v + // a + // + // After: + // + // +-----------+ + // | x | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | | | y | + // +-----------+ +/----------+ + // / + // v + // a + // + left := mut(&n.children[i-1]) + child := mut(&n.children[i]) + xLa, grandChild := left.popBack() + yLa := n.items[i-1] + child.pushFront(yLa, grandChild) + n.items[i-1] = xLa + child.subtreeCount++ + left.subtreeCount-- + if grandChild != nil { + child.subtreeCount += grandChild.subtreeCount + left.subtreeCount -= grandChild.subtreeCount + } + + case i < int(n.count) && n.children[i+1].count > minItems: + // Rebalance from right sibling. + // + // +-----------+ + // | y | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | | | x | + // +-----------+ +/----------+ + // / + // v + // a + // + // After: + // + // +-----------+ + // | x | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | y | | | + // +----------\+ +-----------+ + // \ + // v + // a + // + right := mut(&n.children[i+1]) + child := mut(&n.children[i]) + xLa, grandChild := right.popFront() + yLa := n.items[i] + child.pushBack(yLa, grandChild) + child.subtreeCount++ + right.subtreeCount-- + if grandChild != nil { + child.subtreeCount += grandChild.subtreeCount + right.subtreeCount -= grandChild.subtreeCount + } + n.items[i] = xLa + + default: + // Merge with either the left or right sibling. 
+ // + // +-----------+ + // | u y v | + // +----/-\----+ + // / \ + // v v + // +-----------+ +-----------+ + // | x | | z | + // +-----------+ +-----------+ + // + // After: + // + // +-----------+ + // | u v | + // +-----|-----+ + // | + // v + // +-----------+ + // | x y z | + // +-----------+ + // + if i >= int(n.count) { + i = int(n.count - 1) + } + child := mut(&n.children[i]) + // Make mergeChild mutable, bumping the refcounts on its children if necessary. + _ = mut(&n.children[i+1]) + mergeLa, mergeChild := n.removeAt(i) + child.items[child.count] = mergeLa + copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count]) + if !child.leaf { + copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1]) + } + child.count += mergeChild.count + 1 + child.subtreeCount += mergeChild.subtreeCount + 1 + + mergeChild.decRef(false /* contentsToo */, nil) + } +} + +// InvalidateAnnotation removes any existing cached annotations for the provided +// annotator from this node's subtree. +func (n *node) InvalidateAnnotation(a Annotator) { + // Find this annotator's annotation on this node. + var annot *annotation + for i := range n.annot { + if n.annot[i].annotator == a { + annot = &n.annot[i] + } + } + + if annot != nil && annot.valid { + annot.valid = false + annot.v = a.Zero(annot.v) + } + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + n.children[i].InvalidateAnnotation(a) + } + } +} + +// Annotation retrieves, computing if not already computed, the provided +// annotator's annotation of this node. The second return value indicates +// whether the future reads of this annotation may use the first return value +// as-is. If false, the annotation is not stable and may change on a subsequent +// computation. +func (n *node) Annotation(a Annotator) (interface{}, bool) { + // Find this annotator's annotation on this node. 
+ var annot *annotation + for i := range n.annot { + if n.annot[i].annotator == a { + annot = &n.annot[i] + } + } + + // If it exists and is marked as valid, we can return it without + // recomputing anything. + if annot != nil && annot.valid { + return annot.v, true + } + + if annot == nil { + // This is n's first time being annotated by a. + // Create a new zeroed annotation. + n.annot = append(n.annot, annotation{ + annotator: a, + v: a.Zero(nil), + }) + annot = &n.annot[len(n.annot)-1] + } else { + // There's an existing annotation that must be recomputed. + // Zero its value. + annot.v = a.Zero(annot.v) + } + + annot.valid = true + for i := int16(0); i <= n.count; i++ { + if !n.leaf { + v, ok := n.children[i].Annotation(a) + annot.v = a.Merge(v, annot.v) + annot.valid = annot.valid && ok + } + if i < n.count { + v, ok := a.Accumulate(n.items[i], annot.v) + annot.v = v + annot.valid = annot.valid && ok + } + } + return annot.v, annot.valid +} + +func (n *node) verifyInvariants() { + recomputedSubtreeCount := int(n.count) + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + n.children[i].verifyInvariants() + recomputedSubtreeCount += n.children[i].subtreeCount + } + } + if recomputedSubtreeCount != n.subtreeCount { + panic(fmt.Sprintf("recomputed subtree count (%d) ≠ n.subtreeCount (%d)", + recomputedSubtreeCount, n.subtreeCount)) + } +} + +// btree is an implementation of a B-Tree. +// +// btree stores FileMetadata in an ordered structure, allowing easy insertion, +// removal, and iteration. The B-Tree stores items in order based on cmp. The +// first level of the LSM uses a cmp function that compares sequence numbers. +// All other levels compare using the FileMetadata.Smallest. +// +// Write operations are not safe for concurrent mutation by multiple +// goroutines, but Read operations are. +type btree struct { + root *node + cmp btreeCmp +} + +// Release dereferences and clears the root node of the btree, removing all +// items from the btree. 
In doing so, it decrements contained file counts. +// It returns a slice of newly obsolete backing files, if any. +func (t *btree) Release() (obsolete []*FileBacking) { + if t.root != nil { + t.root.decRef(true /* contentsToo */, &obsolete) + t.root = nil + } + return obsolete +} + +// Clone clones the btree, lazily. It does so in constant time. +func (t *btree) Clone() btree { + c := *t + if c.root != nil { + // Incrementing the reference count on the root node is sufficient to + // ensure that no node in the cloned tree can be mutated by an actor + // holding a reference to the original tree and vice versa. This + // property is upheld because the root node in the receiver btree and + // the returned btree will both necessarily have a reference count of at + // least 2 when this method returns. All tree mutations recursively + // acquire mutable node references (see mut) as they traverse down the + // tree. The act of acquiring a mutable node reference performs a clone + // if a node's reference count is greater than one. Cloning a node (see + // clone) increases the reference count on each of its children, + // ensuring that they have a reference count of at least 2. This, in + // turn, ensures that any of the child nodes that are modified will also + // be copied-on-write, recursively ensuring the immutability property + // over the entire tree. + c.root.incRef() + } + return c +} + +// Delete removes the provided file from the tree. +// It returns true if the file now has a zero reference count. 
+func (t *btree) Delete(item *FileMetadata) (obsolete bool) { + if t.root == nil || t.root.count == 0 { + return false + } + if out := mut(&t.root).Remove(t.cmp, item); out != nil { + obsolete = out.Unref() == 0 + } + if invariants.Enabled { + t.root.verifyInvariants() + } + if t.root.count == 0 { + old := t.root + if t.root.leaf { + t.root = nil + } else { + t.root = t.root.children[0] + } + old.decRef(false /* contentsToo */, nil) + } + return obsolete +} + +// Insert adds the given item to the tree. If a item in the tree already +// equals the given one, Insert panics. +func (t *btree) Insert(item *FileMetadata) error { + if t.root == nil { + t.root = newLeafNode() + } else if t.root.count >= maxItems { + splitLa, splitNode := mut(&t.root).split(maxItems / 2) + newRoot := newNode() + newRoot.count = 1 + newRoot.items[0] = splitLa + newRoot.children[0] = t.root + newRoot.children[1] = splitNode + newRoot.subtreeCount = t.root.subtreeCount + splitNode.subtreeCount + 1 + t.root = newRoot + } + item.Ref() + err := mut(&t.root).Insert(t.cmp, item) + if invariants.Enabled { + t.root.verifyInvariants() + } + return err +} + +// Iter returns a new iterator object. It is not safe to continue using an +// iterator after modifications are made to the tree. If modifications are made, +// create a new iterator. +func (t *btree) Iter() iterator { + return iterator{r: t.root, pos: -1, cmp: t.cmp} +} + +// Count returns the number of files contained within the B-Tree. +func (t *btree) Count() int { + if t.root == nil { + return 0 + } + return t.root.subtreeCount +} + +// String returns a string description of the tree. The format is +// similar to the https://en.wikipedia.org/wiki/Newick_format. 
+func (t *btree) String() string { + if t.Count() == 0 { + return ";" + } + var b strings.Builder + t.root.writeString(&b) + return b.String() +} + +func (n *node) writeString(b *strings.Builder) { + if n.leaf { + for i := int16(0); i < n.count; i++ { + if i != 0 { + b.WriteString(",") + } + b.WriteString(n.items[i].String()) + } + return + } + for i := int16(0); i <= n.count; i++ { + b.WriteString("(") + n.children[i].writeString(b) + b.WriteString(")") + if i < n.count { + b.WriteString(n.items[i].String()) + } + } +} + +// iterStack represents a stack of (node, pos) tuples, which captures +// iteration state as an iterator descends a btree. +type iterStack struct { + // a contains aLen stack frames when an iterator stack is short enough. + // If the iterator stack overflows the capacity of iterStackArr, the stack + // is moved to s and aLen is set to -1. + a iterStackArr + aLen int16 // -1 when using s + s []iterFrame +} + +// Used to avoid allocations for stacks below a certain size. +type iterStackArr [3]iterFrame + +type iterFrame struct { + n *node + pos int16 +} + +func (is *iterStack) push(f iterFrame) { + if is.aLen == -1 { + is.s = append(is.s, f) + } else if int(is.aLen) == len(is.a) { + is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen)) + copy(is.s, is.a[:]) + is.s[int(is.aLen)] = f + is.aLen = -1 + } else { + is.a[is.aLen] = f + is.aLen++ + } +} + +func (is *iterStack) pop() iterFrame { + if is.aLen == -1 { + f := is.s[len(is.s)-1] + is.s = is.s[:len(is.s)-1] + return f + } + is.aLen-- + return is.a[is.aLen] +} + +func (is *iterStack) len() int { + if is.aLen == -1 { + return len(is.s) + } + return int(is.aLen) +} + +func (is *iterStack) clone() iterStack { + // If the iterator is using the embedded iterStackArr, we only need to + // copy the struct itself. 
+ if is.s == nil { + return *is + } + clone := *is + clone.s = make([]iterFrame, len(is.s)) + copy(clone.s, is.s) + return clone +} + +func (is *iterStack) nth(n int) (f iterFrame, ok bool) { + if is.aLen == -1 { + if n >= len(is.s) { + return f, false + } + return is.s[n], true + } + if int16(n) >= is.aLen { + return f, false + } + return is.a[n], true +} + +func (is *iterStack) reset() { + if is.aLen == -1 { + is.s = is.s[:0] + } else { + is.aLen = 0 + } +} + +// iterator is responsible for search and traversal within a btree. +type iterator struct { + // the root node of the B-Tree. + r *node + // n and pos make up the current position of the iterator. + // If valid, n.items[pos] is the current value of the iterator. + // + // n may be nil iff i.r is nil. + n *node + pos int16 + // cmp dictates the ordering of the FileMetadata. + cmp func(*FileMetadata, *FileMetadata) int + // a stack of n's ancestors within the B-Tree, alongside the position + // taken to arrive at n. If non-empty, the bottommost frame of the stack + // will always contain the B-Tree root. + s iterStack +} + +// countLeft returns the count of files that are to the left of the current +// iterator position. +func (i *iterator) countLeft() int { + if i.r == nil { + return 0 + } + + // Each iterator has a stack of frames marking the path from the root node + // to the current iterator position. All files (n.items) and all subtrees + // (n.children) with indexes less than [pos] are to the left of the current + // iterator position. + // + // +------------------------+ - + // | Root pos:5 | | + // +------------------------+ | stack + // | Root/5 pos:3 | | frames + // +------------------------+ | [i.s] + // | Root/5/3 pos:9 | | + // +========================+ - + // | | + // | i.n: Root/5/3/9 i.pos:2| + // +------------------------+ + // + var count int + // Walk all the ancestors in the iterator stack [i.s], tallying up all the + // files and subtrees to the left of the stack frame's position. 
+ f, ok := i.s.nth(0) + for fi := 0; ok; fi++ { + // There are [f.pos] files contained within [f.n.items] that sort to the + // left of the subtree the iterator has descended. + count += int(f.pos) + // Any subtrees that fall before the stack frame's position are entirely + // to the left of the iterator's current position. + for j := int16(0); j < f.pos; j++ { + count += f.n.children[j].subtreeCount + } + f, ok = i.s.nth(fi + 1) + } + + // The bottommost stack frame is inlined within the iterator struct. Again, + // [i.pos] files fall to the left of the current iterator position. + count += int(i.pos) + if !i.n.leaf { + // NB: Unlike above, we use a `<= i.pos` comparison. The iterator is + // positioned at item `i.n.items[i.pos]`, which sorts after everything + // in the subtree at `i.n.children[i.pos]`. + for j := int16(0); j <= i.pos; j++ { + count += i.n.children[j].subtreeCount + } + } + return count +} + +func (i *iterator) clone() iterator { + c := *i + c.s = i.s.clone() + return c +} + +func (i *iterator) reset() { + i.n = i.r + i.pos = -1 + i.s.reset() +} + +func (i iterator) String() string { + var buf bytes.Buffer + for n := 0; ; n++ { + f, ok := i.s.nth(n) + if !ok { + break + } + fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count) + } + if i.r == nil { + fmt.Fprintf(&buf, ": %02d", i.pos) + } else { + fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count) + } + return buf.String() +} + +func cmpIter(a, b iterator) int { + if a.r != b.r { + panic("compared iterators from different btrees") + } + + // Each iterator has a stack of frames marking the path from the root node + // to the current iterator position. We walk both paths formed by the + // iterators' stacks simultaneously, descending from the shared root node, + // always comparing nodes at the same level in the tree. 
+ // + // If the iterators' paths ever diverge and point to different nodes, the + // iterators are not equal and we use the node positions to evaluate the + // comparison. + // + // If an iterator's stack ends, we stop descending and use its current + // node and position for the final comparison. One iterator's stack may + // end before another's if one iterator is positioned deeper in the tree. + // + // a b + // +------------------------+ +--------------------------+ - + // | Root pos:5 | = | Root pos:5 | | + // +------------------------+ +--------------------------+ | stack + // | Root/5 pos:3 | = | Root/5 pos:3 | | frames + // +------------------------+ +--------------------------+ | + // | Root/5/3 pos:9 | > | Root/5/3 pos:1 | | + // +========================+ +==========================+ - + // | | | | + // | a.n: Root/5/3/9 a.pos:2| | b.n: Root/5/3/1, b.pos:5 | + // +------------------------+ +--------------------------+ + + // Initialize with the iterator's current node and position. These are + // conceptually the most-recent/current frame of the iterator stack. + an, apos := a.n, a.pos + bn, bpos := b.n, b.pos + + // aok, bok are set while traversing the iterator's path down the B-Tree. + // They're declared in the outer scope because they help distinguish the + // sentinel case when both iterators' first frame points to the last child + // of the root. If an iterator has no other frames in its stack, it's the + // end sentinel state which sorts after everything else. + var aok, bok bool + for i := 0; ; i++ { + var af, bf iterFrame + af, aok = a.s.nth(i) + bf, bok = b.s.nth(i) + if !aok || !bok { + if aok { + // Iterator a, unlike iterator b, still has a frame. Set an, + // apos so we compare using the frame from the stack. + an, apos = af.n, af.pos + } + if bok { + // Iterator b, unlike iterator a, still has a frame. Set bn, + // bpos so we compare using the frame from the stack. 
+ bn, bpos = bf.n, bf.pos + } + break + } + + // aok && bok + if af.n != bf.n { + panic("nonmatching nodes during btree iterator comparison") + } + if v := stdcmp.Compare(af.pos, bf.pos); v != 0 { + return v + } + // Otherwise continue up both iterators' stacks (equivalently, down the + // B-Tree away from the root). + } + + if aok && bok { + panic("expected one or more stacks to have been exhausted") + } + if an != bn { + panic("nonmatching nodes during btree iterator comparison") + } + if v := stdcmp.Compare(apos, bpos); v != 0 { + return v + } + switch { + case aok: + // a is positioned at a leaf child at this position and b is at an + // end sentinel state. + return -1 + case bok: + // b is positioned at a leaf child at this position and a is at an + // end sentinel state. + return +1 + default: + return 0 + } +} + +func (i *iterator) descend(n *node, pos int16) { + i.s.push(iterFrame{n: n, pos: pos}) + i.n = n.children[pos] + i.pos = 0 +} + +// ascend ascends up to the current node's parent and resets the position +// to the one previously set for this parent node. +func (i *iterator) ascend() { + f := i.s.pop() + i.n = f.n + i.pos = f.pos +} + +// seek repositions the iterator over the first file for which fn returns +// true, mirroring the semantics of the standard library's sort.Search +// function. Like sort.Search, seek requires the iterator's B-Tree to be +// ordered such that fn returns false for some (possibly empty) prefix of the +// tree's files, and then true for the (possibly empty) remainder. +func (i *iterator) seek(fn func(*FileMetadata) bool) { + i.reset() + if i.r == nil { + return + } + + for { + // Logic copied from sort.Search. 
+ j, k := 0, int(i.n.count) + for j < k { + h := int(uint(j+k) >> 1) // avoid overflow when computing h + + // j ≤ h < k + if !fn(i.n.items[h]) { + j = h + 1 // preserves f(j-1) == false + } else { + k = h // preserves f(k) == true + } + } + + i.pos = int16(j) + if i.n.leaf { + if i.pos == i.n.count { + i.next() + } + return + } + i.descend(i.n, i.pos) + } +} + +// first seeks to the first item in the btree. +func (i *iterator) first() { + i.reset() + if i.r == nil { + return + } + for !i.n.leaf { + i.descend(i.n, 0) + } + i.pos = 0 +} + +// last seeks to the last item in the btree. +func (i *iterator) last() { + i.reset() + if i.r == nil { + return + } + for !i.n.leaf { + i.descend(i.n, i.n.count) + } + i.pos = i.n.count - 1 +} + +// next positions the iterator to the item immediately following +// its current position. +func (i *iterator) next() { + if i.r == nil { + return + } + + if i.n.leaf { + if i.pos < i.n.count { + i.pos++ + } + if i.pos < i.n.count { + return + } + for i.s.len() > 0 && i.pos >= i.n.count { + i.ascend() + } + return + } + + i.descend(i.n, i.pos+1) + for !i.n.leaf { + i.descend(i.n, 0) + } + i.pos = 0 +} + +// prev positions the iterator to the item immediately preceding +// its current position. +func (i *iterator) prev() { + if i.r == nil { + return + } + + if i.n.leaf { + i.pos-- + if i.pos >= 0 { + return + } + for i.s.len() > 0 && i.pos < 0 { + i.ascend() + i.pos-- + } + return + } + + i.descend(i.n, i.pos) + for !i.n.leaf { + i.descend(i.n, i.n.count) + } + i.pos = i.n.count - 1 +} + +// valid returns whether the iterator is positioned at a valid position. +func (i *iterator) valid() bool { + return i.r != nil && i.pos >= 0 && i.pos < i.n.count +} + +// cur returns the item at the iterator's current position. It is illegal +// to call cur if the iterator is not valid. 
+func (i *iterator) cur() *FileMetadata { + if invariants.Enabled && !i.valid() { + panic("btree iterator.cur invoked on invalid iterator") + } + return i.n.items[i.pos] +} diff --git a/pebble/internal/manifest/btree_test.go b/pebble/internal/manifest/btree_test.go new file mode 100644 index 0000000..cce22a2 --- /dev/null +++ b/pebble/internal/manifest/btree_test.go @@ -0,0 +1,991 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + stdcmp "cmp" + "fmt" + "math/rand" + "reflect" + "slices" + "sync" + "testing" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" + "github.com/stretchr/testify/require" +) + +func newItem(k InternalKey) *FileMetadata { + m := (&FileMetadata{}).ExtendPointKeyBounds( + base.DefaultComparer.Compare, k, k, + ) + m.InitPhysicalBacking() + return m +} + +func cmp(a, b *FileMetadata) int { + return cmpKey(a.Smallest, b.Smallest) +} + +func cmpKey(a, b InternalKey) int { + return base.InternalCompare(base.DefaultComparer.Compare, a, b) +} + +////////////////////////////////////////// +// Invariant verification // +////////////////////////////////////////// + +// Verify asserts that the tree's structural invariants all hold. 
+func (t *btree) Verify(tt *testing.T) { + if t.Count() == 0 { + require.Nil(tt, t.root) + return + } + t.verifyLeafSameDepth(tt) + t.verifyCountAllowed(tt) + t.isSorted(tt) + t.root.verifyInvariants() +} + +func (t *btree) verifyLeafSameDepth(tt *testing.T) { + h := t.height() + t.root.verifyDepthEqualToHeight(tt, 1, h) +} + +func (n *node) verifyDepthEqualToHeight(t *testing.T, depth, height int) { + if n.leaf { + require.Equal(t, height, depth, "all leaves should have the same depth as the tree height") + } + n.recurse(func(child *node, _ int16) { + child.verifyDepthEqualToHeight(t, depth+1, height) + }) +} + +func (t *btree) verifyCountAllowed(tt *testing.T) { + t.root.verifyCountAllowed(tt, true) +} + +// height returns the height of the tree. +func (t *btree) height() int { + if t.root == nil { + return 0 + } + h := 1 + n := t.root + for !n.leaf { + n = n.children[0] + h++ + } + return h +} + +func (n *node) verifyCountAllowed(t *testing.T, root bool) { + if !root { + require.GreaterOrEqual(t, n.count, int16(minItems), "item count %d must be in range [%d,%d]", n.count, minItems, maxItems) + require.LessOrEqual(t, n.count, int16(maxItems), "item count %d must be in range [%d,%d]", n.count, minItems, maxItems) + } + for i, item := range n.items { + if i < int(n.count) { + require.NotNil(t, item, "item below count") + } else { + require.Nil(t, item, "item above count") + } + } + if !n.leaf { + for i, child := range n.children { + if i <= int(n.count) { + require.NotNil(t, child, "node below count") + } else { + require.Nil(t, child, "node above count") + } + } + } + n.recurse(func(child *node, _ int16) { + child.verifyCountAllowed(t, false) + }) +} + +func (t *btree) isSorted(tt *testing.T) { + t.root.isSorted(tt, t.cmp) +} + +func (n *node) isSorted(t *testing.T, cmp func(*FileMetadata, *FileMetadata) int) { + for i := int16(1); i < n.count; i++ { + require.LessOrEqual(t, cmp(n.items[i-1], n.items[i]), 0) + } + if !n.leaf { + for i := int16(0); i < n.count; i++ 
{ + prev := n.children[i] + next := n.children[i+1] + + require.LessOrEqual(t, cmp(prev.items[prev.count-1], n.items[i]), 0) + require.LessOrEqual(t, cmp(n.items[i], next.items[0]), 0) + } + } + n.recurse(func(child *node, _ int16) { + child.isSorted(t, cmp) + }) +} + +func (n *node) recurse(f func(child *node, pos int16)) { + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + f(n.children[i], i) + } + } +} + +////////////////////////////////////////// +// Unit Tests // +////////////////////////////////////////// + +func key(i int) InternalKey { + if i < 0 || i > 99999 { + panic("key out of bounds") + } + return base.MakeInternalKey([]byte(fmt.Sprintf("%05d", i)), 0, base.InternalKeyKindSet) +} + +func keyWithMemo(i int, memo map[int]InternalKey) InternalKey { + if s, ok := memo[i]; ok { + return s + } + s := key(i) + memo[i] = s + return s +} + +func checkIterRelative(t *testing.T, it *iterator, start, end int, keyMemo map[int]InternalKey) { + t.Helper() + i := start + for ; it.valid(); it.next() { + item := it.cur() + expected := keyWithMemo(i, keyMemo) + if cmpKey(expected, item.Smallest) != 0 { + t.Fatalf("expected %s, but found %s", expected, item.Smallest) + } + i++ + } + if i != end { + t.Fatalf("expected %d, but at %d", end, i) + } +} + +func checkIter(t *testing.T, it iterator, start, end int, keyMemo map[int]InternalKey) { + t.Helper() + i := start + for it.first(); it.valid(); it.next() { + item := it.cur() + expected := keyWithMemo(i, keyMemo) + if cmpKey(expected, item.Smallest) != 0 { + t.Fatalf("expected %s, but found %s", expected, item.Smallest) + } + require.Equal(t, i-start, it.countLeft()) + i++ + } + if i != end { + t.Fatalf("expected %d, but at %d", end, i) + } + + for it.last(); it.valid(); it.prev() { + i-- + item := it.cur() + expected := keyWithMemo(i, keyMemo) + if cmpKey(expected, item.Smallest) != 0 { + t.Fatalf("expected %s, but found %s", expected, item.Smallest) + } + require.Equal(t, i-start, it.countLeft()) + } + if i != start 
{ + t.Fatalf("expected %d, but at %d: %+v", start, i, it) + } +} + +// TestBTree tests basic btree operations. +func TestBTree(t *testing.T) { + var tr btree + tr.cmp = cmp + keyMemo := make(map[int]InternalKey) + + // With degree == 16 (max-items/node == 31) we need 513 items in order for + // there to be 3 levels in the tree. The count here is comfortably above + // that. + const count = 768 + items := rang(0, count-1) + + // Add keys in sorted order. + for i := 0; i < count; i++ { + require.NoError(t, tr.Insert(items[i])) + tr.Verify(t) + if e := i + 1; e != tr.Count() { + t.Fatalf("expected length %d, but found %d", e, tr.Count()) + } + checkIter(t, tr.Iter(), 0, i+1, keyMemo) + } + + // delete keys in sorted order. + for i := 0; i < count; i++ { + obsolete := tr.Delete(items[i]) + if !obsolete { + t.Fatalf("expected item %d to be obsolete", i) + } + tr.Verify(t) + if e := count - (i + 1); e != tr.Count() { + t.Fatalf("expected length %d, but found %d", e, tr.Count()) + } + checkIter(t, tr.Iter(), i+1, count, keyMemo) + } + + // Add keys in reverse sorted order. + for i := 1; i <= count; i++ { + require.NoError(t, tr.Insert(items[count-i])) + tr.Verify(t) + if i != tr.Count() { + t.Fatalf("expected length %d, but found %d", i, tr.Count()) + } + checkIter(t, tr.Iter(), count-i, count, keyMemo) + } + + // delete keys in reverse sorted order. 
+ for i := 1; i <= count; i++ { + obsolete := tr.Delete(items[count-i]) + if !obsolete { + t.Fatalf("expected item %d to be obsolete", i) + } + tr.Verify(t) + if e := count - i; e != tr.Count() { + t.Fatalf("expected length %d, but found %d", e, tr.Count()) + } + checkIter(t, tr.Iter(), 0, count-i, keyMemo) + } +} + +func TestIterClone(t *testing.T) { + const count = 65536 + + var tr btree + tr.cmp = cmp + keyMemo := make(map[int]InternalKey) + + for i := 0; i < count; i++ { + require.NoError(t, tr.Insert(newItem(key(i)))) + } + + it := tr.Iter() + i := 0 + for it.first(); it.valid(); it.next() { + if i%500 == 0 { + c := it.clone() + + require.Equal(t, 0, cmpIter(it, c)) + checkIterRelative(t, &c, i, count, keyMemo) + if i < count { + require.Equal(t, -1, cmpIter(it, c)) + require.Equal(t, +1, cmpIter(c, it)) + } + } + i++ + } +} + +func TestIterCmpEdgeCases(t *testing.T) { + var tr btree + tr.cmp = cmp + t.Run("empty", func(t *testing.T) { + a := tr.Iter() + b := tr.Iter() + require.Equal(t, 0, cmpIter(a, b)) + }) + require.NoError(t, tr.Insert(newItem(key(5)))) + t.Run("exhausted_next", func(t *testing.T) { + a := tr.Iter() + b := tr.Iter() + a.first() + b.first() + require.Equal(t, 0, cmpIter(a, b)) + b.next() + require.False(t, b.valid()) + require.Equal(t, -1, cmpIter(a, b)) + }) + t.Run("exhausted_prev", func(t *testing.T) { + a := tr.Iter() + b := tr.Iter() + a.first() + b.first() + b.prev() + require.False(t, b.valid()) + require.Equal(t, 1, cmpIter(a, b)) + b.next() + require.Equal(t, 0, cmpIter(a, b)) + }) +} + +func TestIterCmpRand(t *testing.T) { + const itemCount = 65536 + const iterCount = 1000 + + var tr btree + tr.cmp = cmp + for i := 0; i < itemCount; i++ { + require.NoError(t, tr.Insert(newItem(key(i)))) + } + + seed := time.Now().UnixNano() + rng := rand.New(rand.NewSource(seed)) + iters1 := make([]*LevelIterator, iterCount) + iters2 := make([]*LevelIterator, iterCount) + for i := 0; i < iterCount; i++ { + k := rng.Intn(itemCount) + iter := 
LevelIterator{iter: tr.Iter()} + iter.SeekGE(base.DefaultComparer.Compare, key(k).UserKey) + iters1[i] = &iter + iters2[i] = &iter + } + + // All the iterators should be positioned, so sorting them by items and by + // iterator comparisons should equal identical orderings. + slices.SortStableFunc(iters1, func(a, b *LevelIterator) int { return cmpIter(a.iter, b.iter) }) + slices.SortStableFunc(iters2, func(a, b *LevelIterator) int { return cmp(a.iter.cur(), b.iter.cur()) }) + for i := 0; i < iterCount; i++ { + if iters1[i] != iters2[i] { + t.Fatalf("seed %d: iters out of order at index %d:\n%s\n\n%s", + seed, i, iters1[i], iters2[i]) + } + } +} + +// TestBTreeSeek tests basic btree iterator operations on an iterator wrapped +// by a LevelIterator. +func TestBTreeSeek(t *testing.T) { + const count = 513 + + var tr btree + tr.cmp = cmp + for i := 0; i < count; i++ { + require.NoError(t, tr.Insert(newItem(key(i*2)))) + } + + it := LevelIterator{iter: tr.Iter()} + for i := 0; i < 2*count-1; i++ { + item := it.SeekGE(base.DefaultComparer.Compare, key(i).UserKey) + if item == nil { + t.Fatalf("%d: expected valid iterator", i) + } + expected := key(2 * ((i + 1) / 2)) + if cmpKey(expected, item.Smallest) != 0 { + t.Fatalf("%d: expected %s, but found %s", i, expected, item.Smallest) + } + } + it.SeekGE(base.DefaultComparer.Compare, key(2*count-1).UserKey) + if it.iter.valid() { + t.Fatalf("expected invalid iterator") + } + + for i := 1; i < 2*count; i++ { + item := it.SeekLT(base.DefaultComparer.Compare, key(i).UserKey) + if item == nil { + t.Fatalf("%d: expected valid iterator", i) + } + expected := key(2 * ((i - 1) / 2)) + if cmpKey(expected, item.Smallest) != 0 { + t.Fatalf("%d: expected %s, but found %s", i, expected, item.Smallest) + } + } + it.SeekLT(base.DefaultComparer.Compare, key(0).UserKey) + if it.iter.valid() { + t.Fatalf("expected invalid iterator") + } +} + +func TestBTreeInsertDuplicateError(t *testing.T) { + var tr btree + tr.cmp = cmp + require.NoError(t, 
tr.Insert(newItem(key(1)))) + require.NoError(t, tr.Insert(newItem(key(2)))) + require.NoError(t, tr.Insert(newItem(key(3)))) + wantErr := errors.Errorf("files %s and %s collided on sort keys", + errors.Safe(base.FileNum(000000)), errors.Safe(base.FileNum(000000))) + require.Error(t, wantErr, tr.Insert(newItem(key(2)))) +} + +// TestBTreeCloneConcurrentOperations tests that cloning a btree returns a new +// btree instance which is an exact logical copy of the original but that can be +// modified independently going forward. +func TestBTreeCloneConcurrentOperations(t *testing.T) { + const cloneTestSize = 1000 + p := perm(cloneTestSize) + + var trees []*btree + treeC, treeDone := make(chan *btree), make(chan struct{}) + go func() { + for b := range treeC { + trees = append(trees, b) + } + close(treeDone) + }() + + var wg sync.WaitGroup + var populate func(tr *btree, start int) + populate = func(tr *btree, start int) { + t.Logf("Starting new clone at %v", start) + treeC <- tr + for i := start; i < cloneTestSize; i++ { + require.NoError(t, tr.Insert(p[i])) + if i%(cloneTestSize/5) == 0 { + wg.Add(1) + c := tr.Clone() + go populate(&c, i+1) + } + } + wg.Done() + } + + wg.Add(1) + var tr btree + tr.cmp = cmp + go populate(&tr, 0) + wg.Wait() + close(treeC) + <-treeDone + + t.Logf("Starting equality checks on %d trees", len(trees)) + want := rang(0, cloneTestSize-1) + for i, tree := range trees { + if got := all(tree); !reflect.DeepEqual(strReprs(got), strReprs(want)) { + t.Errorf("tree %v mismatch", i) + } + } + + t.Log("Removing half of items from first half") + toRemove := want[cloneTestSize/2:] + for i := 0; i < len(trees)/2; i++ { + tree := trees[i] + wg.Add(1) + go func() { + for _, item := range toRemove { + tree.Delete(item) + } + wg.Done() + }() + } + wg.Wait() + + t.Log("Checking all values again") + for i, tree := range trees { + var wantpart []*FileMetadata + if i < len(trees)/2 { + wantpart = want[:cloneTestSize/2] + } else { + wantpart = want + } + if got 
:= all(tree); !reflect.DeepEqual(strReprs(got), strReprs(wantpart)) { + t.Errorf("tree %v mismatch, want %#v got %#v", i, strReprs(wantpart), strReprs(got)) + } + } + + var obsolete []*FileBacking + for i := range trees { + obsolete = append(obsolete, trees[i].Release()...) + } + if len(obsolete) != len(p) { + t.Errorf("got %d obsolete trees, expected %d", len(obsolete), len(p)) + } +} + +// TestIterStack tests the interface of the iterStack type. +func TestIterStack(t *testing.T) { + f := func(i int) iterFrame { return iterFrame{pos: int16(i)} } + var is iterStack + for i := 1; i <= 2*len(iterStackArr{}); i++ { + var j int + for j = 0; j < i; j++ { + is.push(f(j)) + } + require.Equal(t, j, is.len()) + for j--; j >= 0; j-- { + require.Equal(t, f(j), is.pop()) + } + is.reset() + } +} + +func TestIterEndSentinel(t *testing.T) { + var tr btree + tr.cmp = cmp + require.NoError(t, tr.Insert(newItem(key(1)))) + require.NoError(t, tr.Insert(newItem(key(2)))) + require.NoError(t, tr.Insert(newItem(key(3)))) + iter := LevelIterator{iter: tr.Iter()} + iter.SeekGE(base.DefaultComparer.Compare, key(3).UserKey) + require.True(t, iter.iter.valid()) + iter.Next() + require.False(t, iter.iter.valid()) + + // If we seek into the end sentinel, prev should return us to a valid + // position. 
+ iter.SeekGE(base.DefaultComparer.Compare, key(4).UserKey) + require.False(t, iter.iter.valid()) + iter.Prev() + require.True(t, iter.iter.valid()) +} + +type orderStatistic struct{} + +func (o orderStatistic) Zero(dst interface{}) interface{} { + if dst == nil { + return new(int) + } + v := dst.(*int) + *v = 0 + return v +} + +func (o orderStatistic) Accumulate(meta *FileMetadata, dst interface{}) (interface{}, bool) { + v := dst.(*int) + *v++ + return v, true +} + +func (o orderStatistic) Merge(src interface{}, dst interface{}) interface{} { + srcv := src.(*int) + dstv := dst.(*int) + *dstv = *dstv + *srcv + return dstv +} + +func TestAnnotationOrderStatistic(t *testing.T) { + const count = 1000 + ann := orderStatistic{} + + var tr btree + tr.cmp = cmp + for i := 1; i <= count; i++ { + require.NoError(t, tr.Insert(newItem(key(i)))) + + v, ok := tr.root.Annotation(ann) + require.True(t, ok) + vtyped := v.(*int) + require.Equal(t, i, *vtyped) + } + + v, ok := tr.root.Annotation(ann) + require.True(t, ok) + vtyped := v.(*int) + require.Equal(t, count, *vtyped) + + v, ok = tr.root.Annotation(ann) + vtyped = v.(*int) + require.True(t, ok) + require.Equal(t, count, *vtyped) +} + +// TestRandomizedBTree tests a random set of Insert, Delete and iteration +// operations, checking for equivalence with a map of filenums. +func TestRandomizedBTree(t *testing.T) { + const maxFileNum = 50_000 + + seed := time.Now().UnixNano() + t.Log("seed", seed) + rng := rand.New(rand.NewSource(seed)) + + var numOps int + if invariants.RaceEnabled { + // Reduce the number of ops in race mode so the test doesn't take very long. 
+ numOps = 1_000 + rng.Intn(4_000) + } else { + numOps = 10_000 + rng.Intn(40_000) + } + + var metadataAlloc [maxFileNum]FileMetadata + for i := 0; i < len(metadataAlloc); i++ { + metadataAlloc[i].FileNum = base.FileNum(i) + metadataAlloc[i].InitPhysicalBacking() + } + + // Use a btree comparator that sorts by file number to make it easier to + // prevent duplicates or overlaps. + tree := btree{ + cmp: func(a *FileMetadata, b *FileMetadata) int { + return stdcmp.Compare(a.FileNum, b.FileNum) + }, + } + + type opDecl struct { + fn func() + weight int + } + ref := map[base.FileNum]bool{} + ops := []opDecl{ + { + // Insert + fn: func() { + f := &metadataAlloc[rng.Intn(maxFileNum)] + err := tree.Insert(f) + if ref[f.FileNum] { + require.Error(t, err, "btree.Insert should error if file already exists") + } else { + ref[f.FileNum] = true + require.NoError(t, err) + } + }, + weight: 20, + }, + { + // Delete + fn: func() { + f := &metadataAlloc[rng.Intn(maxFileNum)] + tree.Delete(f) + delete(ref, f.FileNum) + }, + weight: 10, + }, + { + // Iterate + fn: func() { + iter := tree.Iter() + count := 0 + var prev base.FileNum + for iter.first(); iter.valid(); iter.next() { + fn := iter.cur().FileNum + require.True(t, ref[fn]) + if count > 0 { + require.Less(t, prev, fn) + } + count++ + } + require.Equal(t, count, len(ref)) + }, + weight: 1, + }, + } + weightSum := 0 + for i := range ops { + weightSum += ops[i].weight + } + + for i := 0; i < numOps; i++ { + w := rng.Intn(weightSum) + for j := range ops { + w -= ops[j].weight + if w < 0 { + ops[j].fn() + break + } + } + } +} + +////////////////////////////////////////// +// Benchmarks // +////////////////////////////////////////// + +// perm returns a random permutation of items with keys in the range [0, n). +func perm(n int) (out []*FileMetadata) { + for _, i := range rand.Perm(n) { + out = append(out, newItem(key(i))) + } + return out +} + +// rang returns an ordered list of items with keys in the range [m, n]. 
+func rang(m, n int) (out []*FileMetadata) { + for i := m; i <= n; i++ { + out = append(out, newItem(key(i))) + } + return out +} + +func strReprs(items []*FileMetadata) []string { + s := make([]string, len(items)) + for i := range items { + s[i] = items[i].String() + } + return s +} + +// all extracts all items from a tree in order as a slice. +func all(tr *btree) (out []*FileMetadata) { + it := tr.Iter() + it.first() + for it.valid() { + out = append(out, it.cur()) + it.next() + } + return out +} + +func forBenchmarkSizes(b *testing.B, f func(b *testing.B, count int)) { + for _, count := range []int{16, 128, 1024, 8192, 65536} { + b.Run(fmt.Sprintf("count=%d", count), func(b *testing.B) { + f(b, count) + }) + } +} + +// BenchmarkBTreeInsert measures btree insertion performance. +func BenchmarkBTreeInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(count) + b.ResetTimer() + for i := 0; i < b.N; { + var tr btree + tr.cmp = cmp + for _, item := range insertP { + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + i++ + if i >= b.N { + return + } + } + } + }) +} + +// BenchmarkBTreeDelete measures btree deletion performance. +func BenchmarkBTreeDelete(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP, removeP := perm(count), perm(count) + b.ResetTimer() + for i := 0; i < b.N; { + b.StopTimer() + var tr btree + tr.cmp = cmp + for _, item := range insertP { + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + b.StartTimer() + for _, item := range removeP { + tr.Delete(item) + i++ + if i >= b.N { + return + } + } + if tr.Count() > 0 { + b.Fatalf("tree not empty: %s", &tr) + } + } + }) +} + +// BenchmarkBTreeDeleteInsert measures btree deletion and insertion performance. 
+func BenchmarkBTreeDeleteInsert(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(count) + var tr btree + tr.cmp = cmp + for _, item := range insertP { + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + item := insertP[i%count] + tr.Delete(item) + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + }) +} + +// BenchmarkBTreeDeleteInsertCloneOnce measures btree deletion and insertion +// performance after the tree has been copy-on-write cloned once. +func BenchmarkBTreeDeleteInsertCloneOnce(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(count) + var tr btree + tr.cmp = cmp + for _, item := range insertP { + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + tr = tr.Clone() + b.ResetTimer() + for i := 0; i < b.N; i++ { + item := insertP[i%count] + tr.Delete(item) + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + }) +} + +// BenchmarkBTreeDeleteInsertCloneEachTime measures btree deletion and insertion +// performance while the tree is repeatedly copy-on-write cloned. +func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) { + for _, release := range []bool{false, true} { + b.Run(fmt.Sprintf("release=%t", release), func(b *testing.B) { + forBenchmarkSizes(b, func(b *testing.B, count int) { + insertP := perm(count) + var tr, trRelease btree + tr.cmp = cmp + trRelease.cmp = cmp + for _, item := range insertP { + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + item := insertP[i%count] + if release { + trRelease.Release() + trRelease = tr + } + tr = tr.Clone() + tr.Delete(item) + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + }) + }) + } +} + +// BenchmarkBTreeIter measures the cost of creating a btree iterator. 
+func BenchmarkBTreeIter(b *testing.B) { + var tr btree + tr.cmp = cmp + for i := 0; i < b.N; i++ { + it := tr.Iter() + it.first() + } +} + +// BenchmarkBTreeIterSeekGE measures the cost of seeking a btree iterator +// forward. +func BenchmarkBTreeIterSeekGE(b *testing.B) { + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + forBenchmarkSizes(b, func(b *testing.B, count int) { + var keys []InternalKey + var tr btree + tr.cmp = cmp + + for i := 0; i < count; i++ { + s := key(i) + keys = append(keys, s) + if err := tr.Insert(newItem(s)); err != nil { + b.Fatal(err) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := keys[rng.Intn(len(keys))] + it := LevelIterator{iter: tr.Iter()} + f := it.SeekGE(base.DefaultComparer.Compare, k.UserKey) + if testing.Verbose() { + if f == nil { + b.Fatal("expected to find key") + } + if cmpKey(k, f.Smallest) != 0 { + b.Fatalf("expected %s, but found %s", k, f.Smallest) + } + } + } + }) +} + +// BenchmarkBTreeIterSeekLT measures the cost of seeking a btree iterator +// backward. +func BenchmarkBTreeIterSeekLT(b *testing.B) { + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + forBenchmarkSizes(b, func(b *testing.B, count int) { + var keys []InternalKey + var tr btree + tr.cmp = cmp + + for i := 0; i < count; i++ { + k := key(i) + keys = append(keys, k) + if err := tr.Insert(newItem(k)); err != nil { + b.Fatal(err) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + j := rng.Intn(len(keys)) + k := keys[j] + it := LevelIterator{iter: tr.Iter()} + f := it.SeekLT(base.DefaultComparer.Compare, k.UserKey) + if testing.Verbose() { + if j == 0 { + if f != nil { + b.Fatal("unexpected key") + } + } else { + if f == nil { + b.Fatal("expected to find key") + } + k := keys[j-1] + if cmpKey(k, f.Smallest) != 0 { + b.Fatalf("expected %s, but found %s", k, f.Smallest) + } + } + } + } + }) +} + +// BenchmarkBTreeIterNext measures the cost of seeking a btree iterator to the +// next item in the tree. 
+func BenchmarkBTreeIterNext(b *testing.B) { + var tr btree + tr.cmp = cmp + + const count = 8 << 10 + for i := 0; i < count; i++ { + item := newItem(key(i)) + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + + it := tr.Iter() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.valid() { + it.first() + } + it.next() + } +} + +// BenchmarkBTreeIterPrev measures the cost of seeking a btree iterator to the +// previous item in the tree. +func BenchmarkBTreeIterPrev(b *testing.B) { + var tr btree + tr.cmp = cmp + + const count = 8 << 10 + for i := 0; i < count; i++ { + item := newItem(key(i)) + if err := tr.Insert(item); err != nil { + b.Fatal(err) + } + } + + it := tr.Iter() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if !it.valid() { + it.first() + } + it.prev() + } +} diff --git a/pebble/internal/manifest/l0_sublevels.go b/pebble/internal/manifest/l0_sublevels.go new file mode 100644 index 0000000..3857045 --- /dev/null +++ b/pebble/internal/manifest/l0_sublevels.go @@ -0,0 +1,2042 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + "fmt" + "math" + "sort" + "strings" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" + stdcmp "github.com/cockroachdb/pebble/shims/cmp" + "github.com/cockroachdb/pebble/shims/slices" +) + +// errInvalidL0SublevelsOpt is for use in AddL0Files when the incremental +// sublevel generation optimization failed, and NewL0Sublevels must be called. +var errInvalidL0SublevelsOpt = errors.New("pebble: L0 sublevel generation optimization cannot be used") + +// Intervals are of the form [start, end) with no gap between intervals. Each +// file overlaps perfectly with a sequence of intervals. 
This perfect overlap +// occurs because the union of file boundary keys is used to pick intervals. +// However the largest key in a file is inclusive, so when it is used as +// an interval, the actual key is ImmediateSuccessor(key). We don't have the +// ImmediateSuccessor function to do this computation, so we instead keep an +// isLargest bool to remind the code about this fact. This is used for +// comparisons in the following manner: +// - intervalKey{k, false} < intervalKey{k, true} +// - k1 < k2 -> intervalKey{k1, _} < intervalKey{k2, _}. +// +// Note that the file's largest key is exclusive if the internal key +// has a trailer matching the rangedel sentinel key. In this case, we set +// isLargest to false for end interval computation. +// +// For example, consider three files with bounds [a,e], [b,g], and [e,j]. The +// interval keys produced would be intervalKey{a, false}, intervalKey{b, false}, +// intervalKey{e, false}, intervalKey{e, true}, intervalKey{g, true} and +// intervalKey{j, true}, resulting in intervals +// [a, b), [b, (e, false)), [(e,false), (e, true)), [(e, true), (g, true)) and +// [(g, true), (j, true)). The first file overlaps with the first three +// perfectly, the second file overlaps with the second through to fourth +// intervals, and the third file overlaps with the last three. +// +// The intervals are indexed starting from 0, with the index of the interval +// being the index of the start key of the interval. +// +// In addition to helping with compaction picking, we use interval indices +// to assign each file an interval range once. Subsequent operations, say +// picking overlapping files for a compaction, only need to use the index +// numbers and so avoid expensive byte slice comparisons. +type intervalKey struct { + key []byte + isLargest bool +} + +// intervalKeyTemp is used in the sortAndSweep step. It contains additional metadata +// which is used to generate the {min,max}IntervalIndex for files. 
+type intervalKeyTemp struct { + intervalKey intervalKey + fileMeta *FileMetadata + isEndKey bool +} + +func (i *intervalKeyTemp) setFileIntervalIndex(idx int) { + if i.isEndKey { + // This is the right endpoint of some file interval, so the + // file.maxIntervalIndex must be j - 1 as maxIntervalIndex is + // inclusive. + i.fileMeta.maxIntervalIndex = idx - 1 + return + } + // This is the left endpoint for some file interval, so the + // file.minIntervalIndex must be j. + i.fileMeta.minIntervalIndex = idx +} + +func intervalKeyCompare(cmp Compare, a, b intervalKey) int { + rv := cmp(a.key, b.key) + if rv == 0 { + if a.isLargest && !b.isLargest { + return +1 + } + if !a.isLargest && b.isLargest { + return -1 + } + } + return rv +} + +type intervalKeySorter struct { + keys []intervalKeyTemp + cmp Compare +} + +func (s intervalKeySorter) Len() int { return len(s.keys) } +func (s intervalKeySorter) Less(i, j int) bool { + return intervalKeyCompare(s.cmp, s.keys[i].intervalKey, s.keys[j].intervalKey) < 0 +} +func (s intervalKeySorter) Swap(i, j int) { + s.keys[i], s.keys[j] = s.keys[j], s.keys[i] +} + +// sortAndSweep will sort the intervalKeys using intervalKeySorter, remove the +// duplicate fileIntervals, and set the {min, max}IntervalIndex for the files. +func sortAndSweep(keys []intervalKeyTemp, cmp Compare) []intervalKeyTemp { + if len(keys) == 0 { + return nil + } + sorter := intervalKeySorter{keys: keys, cmp: cmp} + sort.Sort(sorter) + + // intervalKeys are generated using the file bounds. Specifically, there are + // 2 intervalKeys for each file, and len(keys) = 2 * number of files. Each + // `intervalKeyTemp` stores information about which file it was generated + // from, and whether the key represents the end key of the file. So, as + // we're deduplicating the `keys` slice, we're guaranteed to iterate over + // the interval keys belonging to each of the files. 
Since the + // file.{min,max}IntervalIndex points to the position of the files bounds in + // the deduplicated `keys` slice, we can determine + // file.{min,max}IntervalIndex during the iteration. + i := 0 + j := 0 + for i < len(keys) { + // loop invariant: j <= i + currKey := keys[i] + keys[j] = keys[i] + + for { + keys[i].setFileIntervalIndex(j) + i++ + if i >= len(keys) || intervalKeyCompare(cmp, currKey.intervalKey, keys[i].intervalKey) != 0 { + break + } + } + j++ + } + return keys[:j] +} + +// A key interval of the form [start, end). The end is not represented here +// since it is implicit in the start of the next interval. The last interval is +// an exception but we don't need to ever lookup the end of that interval; the +// last fileInterval will only act as an end key marker. The set of intervals +// is const after initialization. +type fileInterval struct { + index int + startKey intervalKey + + // True iff some file in this interval is compacting to base. Such intervals + // cannot have any files participate in L0 -> Lbase compactions. + isBaseCompacting bool + + // The min and max intervals index across all the files that overlap with + // this interval. Inclusive on both sides. + filesMinIntervalIndex int + filesMaxIntervalIndex int + + // True if another interval that has a file extending into this interval is + // undergoing a compaction into Lbase. In other words, this bool is true if + // any interval in [filesMinIntervalIndex, filesMaxIntervalIndex] has + // isBaseCompacting set to true. This lets the compaction picker + // de-prioritize this interval for picking compactions, since there's a high + // chance that a base compaction with a sufficient height of sublevels + // rooted at this interval could not be chosen due to the ongoing base + // compaction in the other interval. 
If the file straddling the two + // intervals is at a sufficiently high sublevel (with enough compactible + // files below it to satisfy minCompactionDepth), this is not an issue, but + // to optimize for quickly picking base compactions far away from other base + // compactions, this bool is used as a heuristic (but not as a complete + // disqualifier). + intervalRangeIsBaseCompacting bool + + // All files in this interval, in increasing sublevel order. + files []*FileMetadata + + // len(files) - compactingFileCount is the stack depth that requires + // starting new compactions. This metric is not precise since the + // compactingFileCount can include files that are part of N (where N > 1) + // intra-L0 compactions, so the stack depth after those complete will be + // len(files) - compactingFileCount + N. We ignore this imprecision since we + // don't want to track which files are part of which intra-L0 compaction. + compactingFileCount int + + // Interpolated from files in this interval. For files spanning multiple + // intervals, we assume an equal distribution of bytes across all those + // intervals. + estimatedBytes uint64 +} + +// Helper type for any cases requiring a bool slice. +type bitSet []bool + +func newBitSet(n int) bitSet { + return make([]bool, n) +} + +func (b *bitSet) markBit(i int) { + (*b)[i] = true +} + +func (b *bitSet) markBits(start, end int) { + for i := start; i < end; i++ { + (*b)[i] = true + } +} + +func (b *bitSet) clearAllBits() { + for i := range *b { + (*b)[i] = false + } +} + +// L0Compaction describes an active compaction with inputs from L0. +type L0Compaction struct { + Smallest InternalKey + Largest InternalKey + IsIntraL0 bool +} + +// L0Sublevels represents a sublevel view of SSTables in L0. Tables in one +// sublevel are non-overlapping in key ranges, and keys in higher-indexed +// sublevels shadow older versions in lower-indexed sublevels. 
These invariants +// are similar to the regular level invariants, except with higher indexed +// sublevels having newer keys as opposed to lower indexed levels. +// +// There is no limit to the number of sublevels that can exist in L0 at any +// time, however read and compaction performance is best when there are as few +// sublevels as possible. +type L0Sublevels struct { + // Levels are ordered from oldest sublevel to youngest sublevel in the + // outer slice, and the inner slice contains non-overlapping files for + // that sublevel in increasing key order. Levels is constructed from + // levelFiles and is used by callers that require a LevelSlice. The below two + // fields are treated as immutable once created in NewL0Sublevels. + Levels []LevelSlice + levelFiles [][]*FileMetadata + + cmp Compare + formatKey base.FormatKey + + fileBytes uint64 + // All the L0 files, ordered from oldest to youngest. + levelMetadata *LevelMetadata + + // The file intervals in increasing key order. + orderedIntervals []fileInterval + + // Keys to break flushes at. + flushSplitUserKeys [][]byte + + // Only used to check invariants. + addL0FilesCalled bool +} + +type sublevelSorter []*FileMetadata + +// Len implements sort.Interface. +func (sl sublevelSorter) Len() int { + return len(sl) +} + +// Less implements sort.Interface. +func (sl sublevelSorter) Less(i, j int) bool { + return sl[i].minIntervalIndex < sl[j].minIntervalIndex +} + +// Swap implements sort.Interface. +func (sl sublevelSorter) Swap(i, j int) { + sl[i], sl[j] = sl[j], sl[i] +} + +// NewL0Sublevels creates an L0Sublevels instance for a given set of L0 files. +// These files must all be in L0 and must be sorted by seqnum (see +// SortBySeqNum). During interval iteration, when flushSplitMaxBytes bytes are +// exceeded in the range of intervals since the last flush split key, a flush +// split key is added. 
+// +// This method can be called without DB.mu being held, so any DB.mu protected +// fields in FileMetadata cannot be accessed here, such as Compacting and +// IsIntraL0Compacting. Those fields are accessed in InitCompactingFileInfo +// instead. +func NewL0Sublevels( + levelMetadata *LevelMetadata, cmp Compare, formatKey base.FormatKey, flushSplitMaxBytes int64, +) (*L0Sublevels, error) { + s := &L0Sublevels{cmp: cmp, formatKey: formatKey} + s.levelMetadata = levelMetadata + keys := make([]intervalKeyTemp, 0, 2*s.levelMetadata.Len()) + iter := levelMetadata.Iter() + for i, f := 0, iter.First(); f != nil; i, f = i+1, iter.Next() { + f.L0Index = i + keys = append(keys, intervalKeyTemp{ + intervalKey: intervalKey{key: f.Smallest.UserKey}, + fileMeta: f, + isEndKey: false, + }) + keys = append(keys, intervalKeyTemp{ + intervalKey: intervalKey{ + key: f.Largest.UserKey, + isLargest: !f.Largest.IsExclusiveSentinel(), + }, + fileMeta: f, + isEndKey: true, + }) + } + keys = sortAndSweep(keys, cmp) + // All interval indices reference s.orderedIntervals. + s.orderedIntervals = make([]fileInterval, len(keys)) + for i := range keys { + s.orderedIntervals[i] = fileInterval{ + index: i, + startKey: keys[i].intervalKey, + filesMinIntervalIndex: i, + filesMaxIntervalIndex: i, + } + } + // Initialize minIntervalIndex and maxIntervalIndex for each file, and use that + // to update intervals. + for f := iter.First(); f != nil; f = iter.Next() { + if err := s.addFileToSublevels(f, false /* checkInvariant */); err != nil { + return nil, err + } + } + // Sort each sublevel in increasing key order. + for i := range s.levelFiles { + sort.Sort(sublevelSorter(s.levelFiles[i])) + } + + // Construct a parallel slice of sublevel B-Trees. + // TODO(jackson): Consolidate and only use the B-Trees. 
+ for _, sublevelFiles := range s.levelFiles { + tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles) + s.Levels = append(s.Levels, ls) + tr.Release() + } + + s.calculateFlushSplitKeys(flushSplitMaxBytes) + return s, nil +} + +// Helper function to merge new intervalKeys into an existing slice of old +// fileIntervals, into result. Returns the new result and a slice of ints +// mapping old interval indices to new ones. The added intervalKeys do not need +// to be sorted; they get sorted and deduped in this function. +func mergeIntervals( + old, result []fileInterval, added []intervalKeyTemp, compare Compare, +) ([]fileInterval, []int) { + sorter := intervalKeySorter{keys: added, cmp: compare} + sort.Sort(sorter) + + oldToNewMap := make([]int, len(old)) + i := 0 + j := 0 + + for i < len(old) || j < len(added) { + for j > 0 && j < len(added) && intervalKeyCompare(compare, added[j-1].intervalKey, added[j].intervalKey) == 0 { + added[j].setFileIntervalIndex(len(result) - 1) + j++ + } + if i >= len(old) && j >= len(added) { + break + } + var cmp int + if i >= len(old) { + cmp = +1 + } + if j >= len(added) { + cmp = -1 + } + if cmp == 0 { + cmp = intervalKeyCompare(compare, old[i].startKey, added[j].intervalKey) + } + switch { + case cmp <= 0: + // Shallow-copy the existing interval. + newInterval := old[i] + result = append(result, newInterval) + oldToNewMap[i] = len(result) - 1 + i++ + if cmp == 0 { + added[j].setFileIntervalIndex(len(result) - 1) + j++ + } + case cmp > 0: + var prevInterval fileInterval + // Insert a new interval for a newly-added file. prevInterval, if + // non-zero, will be "inherited"; we copy its files as those extend + // into this interval. 
+ if len(result) > 0 { + prevInterval = result[len(result)-1] + } + newInterval := fileInterval{ + index: len(result), + startKey: added[j].intervalKey, + filesMinIntervalIndex: len(result), + filesMaxIntervalIndex: len(result), + + // estimatedBytes gets recalculated later on, as the number of intervals + // the file bytes are interpolated over has changed. + estimatedBytes: 0, + // Copy the below attributes from prevInterval. + files: append([]*FileMetadata(nil), prevInterval.files...), + isBaseCompacting: prevInterval.isBaseCompacting, + intervalRangeIsBaseCompacting: prevInterval.intervalRangeIsBaseCompacting, + compactingFileCount: prevInterval.compactingFileCount, + } + result = append(result, newInterval) + added[j].setFileIntervalIndex(len(result) - 1) + j++ + } + } + return result, oldToNewMap +} + +// AddL0Files incrementally builds a new L0Sublevels for when the only change +// since the receiver L0Sublevels was an addition of the specified files, with +// no L0 deletions. The common case of this is an ingestion or a flush. These +// files can "sit on top" of existing sublevels, creating at most one new +// sublevel for a flush (and possibly multiple for an ingestion), and at most +// 2*len(files) additions to s.orderedIntervals. No files must have been deleted +// from L0, and the added files must all be newer in sequence numbers than +// existing files in L0Sublevels. The files parameter must be sorted in seqnum +// order. The levelMetadata parameter corresponds to the new L0 post addition of +// files. This method is meant to be significantly more performant than +// NewL0Sublevels. +// +// Note that this function can only be called once on a given receiver; it +// appends to some slices in s which is only safe when done once. This is okay, +// as the common case (generating a new L0Sublevels after a flush/ingestion) is +// only going to necessitate one call of this method on a given receiver. 
The +// returned value, if non-nil, can then have [*L0Sublevels.AddL0Files] called on +// it again, and so on. If [errInvalidL0SublevelsOpt] is returned as an error, +// it likely means the optimization could not be applied (i.e. files added were +// older than files already in the sublevels, which is possible around +// ingestions and in tests). Eg. it can happen when an ingested file was +// ingested without queueing a flush since it did not actually overlap with any +// keys in the memtable. Later on the memtable was flushed, and the memtable had +// keys spanning around the ingested file, producing a flushed file that +// overlapped with the ingested file in file bounds but not in keys. It's +// possible for that flushed file to have a lower LargestSeqNum than the +// ingested file if all the additions after the ingestion were to another +// flushed file that was split into a separate sstable during flush. Any other +// non-nil error means [L0Sublevels] generation failed in the same way as +// [NewL0Sublevels] would likely fail. +func (s *L0Sublevels) AddL0Files( + files []*FileMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata, +) (*L0Sublevels, error) { + if invariants.Enabled && s.addL0FilesCalled { + panic("AddL0Files called twice on the same receiver") + } + s.addL0FilesCalled = true + + // Start with a shallow copy of s. + newVal := &L0Sublevels{} + *newVal = *s + + newVal.addL0FilesCalled = false + newVal.levelMetadata = levelMetadata + // Deep copy levelFiles and Levels, as they are mutated and sorted below. + // Shallow copies of slices that we just append to, are okay. 
+ newVal.levelFiles = make([][]*FileMetadata, len(s.levelFiles)) + for i := range s.levelFiles { + newVal.levelFiles[i] = make([]*FileMetadata, len(s.levelFiles[i])) + copy(newVal.levelFiles[i], s.levelFiles[i]) + } + newVal.Levels = make([]LevelSlice, len(s.Levels)) + copy(newVal.Levels, s.Levels) + + fileKeys := make([]intervalKeyTemp, 0, 2*len(files)) + for _, f := range files { + left := intervalKeyTemp{ + intervalKey: intervalKey{key: f.Smallest.UserKey}, + fileMeta: f, + } + right := intervalKeyTemp{ + intervalKey: intervalKey{ + key: f.Largest.UserKey, + isLargest: !f.Largest.IsExclusiveSentinel(), + }, + fileMeta: f, + isEndKey: true, + } + fileKeys = append(fileKeys, left, right) + } + keys := make([]fileInterval, 0, 2*levelMetadata.Len()) + var oldToNewMap []int + // We can avoid the sortAndSweep step on the combined length of + // s.orderedIntervals and fileKeys by treating this as a merge of two sorted + // runs, fileKeys and s.orderedIntervals, into `keys` which will form + // newVal.orderedIntervals. + keys, oldToNewMap = mergeIntervals(s.orderedIntervals, keys, fileKeys, s.cmp) + if invariants.Enabled { + for i := 1; i < len(keys); i++ { + if intervalKeyCompare(newVal.cmp, keys[i-1].startKey, keys[i].startKey) >= 0 { + panic("keys not sorted correctly") + } + } + } + newVal.orderedIntervals = keys + // Update indices in s.orderedIntervals for fileIntervals we retained. + for _, newIdx := range oldToNewMap { + newInterval := &keys[newIdx] + newInterval.index = newIdx + // This code, and related code in the for loop below, adjusts + // files{Min,Max}IntervalIndex just for interval indices shifting due to + // new intervals, and not for any of the new files being added to the + // same intervals. The goal is to produce a state of the system that's + // accurate for all existing files, and has all the new intervals to + // support new files. Once that's done, we can just call + // addFileToSublevel to adjust all relevant intervals for new files. 
+ newInterval.filesMinIntervalIndex = oldToNewMap[newInterval.filesMinIntervalIndex] + // maxIntervalIndexes are special. Since it's an inclusive end bound, we + // actually have to map it to the _next_ old interval's new previous + // interval. This logic is easier to understand if you see + // [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex, + // f.maxIntervalIndex+1). The other case to remember is when the + // interval is completely empty (i.e. len(newInterval.files) == 0); in + // that case we want to refer back to ourselves regardless of additions + // to the right of us. + if newInterval.filesMaxIntervalIndex < len(oldToNewMap)-1 && len(newInterval.files) > 0 { + newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex+1] - 1 + } else { + // newInterval.filesMaxIntervalIndex == len(oldToNewMap)-1. + newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex] + } + } + // Loop through all instances of new intervals added between two old + // intervals and expand [filesMinIntervalIndex, filesMaxIntervalIndex] of + // new intervals to reflect that of adjacent old intervals. + { + // We can skip cases where new intervals were added to the left of all + // existing intervals (eg. if the first entry in oldToNewMap is + // oldToNewMap[0] >= 1). Those intervals will only contain newly added + // files and will have their parameters adjusted down in + // addFileToSublevels. The same can also be said about new intervals + // that are to the right of all existing intervals. + lastIdx := 0 + for _, newIdx := range oldToNewMap { + for i := lastIdx + 1; i < newIdx; i++ { + minIntervalIndex := i + maxIntervalIndex := i + if keys[lastIdx].filesMaxIntervalIndex != lastIdx { + // Last old interval has files extending into keys[i]. 
+ minIntervalIndex = keys[lastIdx].filesMinIntervalIndex + maxIntervalIndex = keys[lastIdx].filesMaxIntervalIndex + } + + keys[i].filesMinIntervalIndex = minIntervalIndex + keys[i].filesMaxIntervalIndex = maxIntervalIndex + } + lastIdx = newIdx + } + } + // Go through old files and update interval indices. + // + // TODO(bilal): This is the only place in this method where we loop through + // all existing files, which could be much more in number than newly added + // files. See if we can avoid the need for this, either by getting rid of + // f.minIntervalIndex and f.maxIntervalIndex and calculating them on the fly + // with a binary search, or by only looping through files to the right of + // the first interval touched by this method. + for sublevel := range s.Levels { + s.Levels[sublevel].Each(func(f *FileMetadata) { + oldIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1 + oldMinIntervalIndex := f.minIntervalIndex + f.minIntervalIndex = oldToNewMap[f.minIntervalIndex] + // maxIntervalIndex is special. Since it's an inclusive end bound, + // we actually have to map it to the _next_ old interval's new + // previous interval. This logic is easier to understand if you see + // [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex, + // f.maxIntervalIndex+1). + f.maxIntervalIndex = oldToNewMap[f.maxIntervalIndex+1] - 1 + newIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1 + // Recalculate estimatedBytes for all old files across new + // intervals, but only if new intervals were added in between. + if oldIntervalDelta != newIntervalDelta { + // j is incremented so that oldToNewMap[j] points to the next + // old interval. This is used to distinguish between old + // intervals (i.e. ones where we need to subtract + // f.Size/oldIntervalDelta) from new ones (where we don't need + // to subtract). In both cases we need to add + // f.Size/newIntervalDelta. 
+ j := oldMinIntervalIndex + for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { + if oldToNewMap[j] == i { + newVal.orderedIntervals[i].estimatedBytes -= f.Size / uint64(oldIntervalDelta) + j++ + } + newVal.orderedIntervals[i].estimatedBytes += f.Size / uint64(newIntervalDelta) + } + } + }) + } + updatedSublevels := make([]int, 0) + // Update interval indices for new files. + for i, f := range files { + f.L0Index = s.levelMetadata.Len() + i + if err := newVal.addFileToSublevels(f, true /* checkInvariant */); err != nil { + return nil, err + } + updatedSublevels = append(updatedSublevels, f.SubLevel) + } + + // Sort and deduplicate updatedSublevels. + sort.Ints(updatedSublevels) + { + j := 0 + for i := 1; i < len(updatedSublevels); i++ { + if updatedSublevels[i] != updatedSublevels[j] { + j++ + updatedSublevels[j] = updatedSublevels[i] + } + } + updatedSublevels = updatedSublevels[:j+1] + } + + // Sort each updated sublevel in increasing key order. + for _, sublevel := range updatedSublevels { + sort.Sort(sublevelSorter(newVal.levelFiles[sublevel])) + } + + // Construct a parallel slice of sublevel B-Trees. + // TODO(jackson): Consolidate and only use the B-Trees. + for _, sublevel := range updatedSublevels { + tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel]) + if sublevel == len(newVal.Levels) { + newVal.Levels = append(newVal.Levels, ls) + } else { + // sublevel < len(s.Levels). If this panics, updatedSublevels was not + // populated correctly. + newVal.Levels[sublevel] = ls + } + tr.Release() + } + + newVal.flushSplitUserKeys = nil + newVal.calculateFlushSplitKeys(flushSplitMaxBytes) + return newVal, nil +} + +// addFileToSublevels is called during L0Sublevels generation, and adds f to the +// correct sublevel's levelFiles, the relevant intervals' files slices, and sets +// interval indices on f. This method, if called successively on multiple files, +// _must_ be called on successively newer files (by seqnum). 
If checkInvariant +// is true, it could check for this in some cases and return +// [errInvalidL0SublevelsOpt] if that invariant isn't held. +func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) error { + // This is a simple and not very accurate estimate of the number of + // bytes this SSTable contributes to the intervals it is a part of. + // + // TODO(bilal): Call EstimateDiskUsage in sstable.Reader with interval + // bounds to get a better estimate for each interval. + interpolatedBytes := f.Size / uint64(f.maxIntervalIndex-f.minIntervalIndex+1) + s.fileBytes += f.Size + subLevel := 0 + // Update state in every fileInterval for this file. + for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { + interval := &s.orderedIntervals[i] + if len(interval.files) > 0 { + if checkInvariant && interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum { + // We are sliding this file "underneath" an existing file. Throw away + // and start over in NewL0Sublevels. + return errInvalidL0SublevelsOpt + } + // interval.files is sorted by sublevels, from lowest to highest. + // AddL0Files can only add files at sublevels higher than existing files + // in the same key intervals. 
+ if maxSublevel := interval.files[len(interval.files)-1].SubLevel; subLevel <= maxSublevel { + subLevel = maxSublevel + 1 + } + } + interval.estimatedBytes += interpolatedBytes + if f.minIntervalIndex < interval.filesMinIntervalIndex { + interval.filesMinIntervalIndex = f.minIntervalIndex + } + if f.maxIntervalIndex > interval.filesMaxIntervalIndex { + interval.filesMaxIntervalIndex = f.maxIntervalIndex + } + interval.files = append(interval.files, f) + } + f.SubLevel = subLevel + if subLevel > len(s.levelFiles) { + return errors.Errorf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles)) + } + if subLevel == len(s.levelFiles) { + s.levelFiles = append(s.levelFiles, []*FileMetadata{f}) + } else { + s.levelFiles[subLevel] = append(s.levelFiles[subLevel], f) + } + return nil +} + +func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) { + var cumulativeBytes uint64 + // Multiply flushSplitMaxBytes by the number of sublevels. This prevents + // excessive flush splitting when the number of sublevels increases. + flushSplitMaxBytes *= int64(len(s.levelFiles)) + for i := 0; i < len(s.orderedIntervals); i++ { + interval := &s.orderedIntervals[i] + if flushSplitMaxBytes > 0 && cumulativeBytes > uint64(flushSplitMaxBytes) && + (len(s.flushSplitUserKeys) == 0 || + !bytes.Equal(interval.startKey.key, s.flushSplitUserKeys[len(s.flushSplitUserKeys)-1])) { + s.flushSplitUserKeys = append(s.flushSplitUserKeys, interval.startKey.key) + cumulativeBytes = 0 + } + cumulativeBytes += s.orderedIntervals[i].estimatedBytes + } +} + +// InitCompactingFileInfo initializes internal flags relating to compacting +// files. Must be called after sublevel initialization. +// +// Requires DB.mu *and* the manifest lock to be held. 
+func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) { + for i := range s.orderedIntervals { + s.orderedIntervals[i].compactingFileCount = 0 + s.orderedIntervals[i].isBaseCompacting = false + s.orderedIntervals[i].intervalRangeIsBaseCompacting = false + } + + iter := s.levelMetadata.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if invariants.Enabled { + if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) { + panic(fmt.Sprintf("f.minIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s", + s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(f.Smallest.UserKey))) + } + if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) { + panic(fmt.Sprintf("f.maxIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s", + s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(f.Smallest.UserKey))) + } + } + if !f.IsCompacting() { + continue + } + if invariants.Enabled { + if s.cmp(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) != 0 || s.cmp(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) != 0 { + panic(fmt.Sprintf("file %s has inconsistent L0 Sublevel interval bounds: %s-%s, %s-%s", f.FileNum, + s.orderedIntervals[f.minIntervalIndex].startKey.key, s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, + f.Smallest.UserKey, f.Largest.UserKey)) + } + } + for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { + interval := &s.orderedIntervals[i] + interval.compactingFileCount++ + if !f.IsIntraL0Compacting { + // If f.Compacting && !f.IsIntraL0Compacting, this file is + // being compacted to Lbase. + interval.isBaseCompacting = true + } + } + } + + // Some intervals may be base compacting without the files contained within + // those intervals being marked as compacting. 
This is possible if the files + // were added after the compaction initiated, and the active compaction + // files straddle the input file. Mark these intervals as base compacting. + for _, c := range inProgress { + startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false} + endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()} + start, _ := slices.BinarySearchFunc(s.orderedIntervals, startIK, func(a fileInterval, b intervalKey) int { + return intervalKeyCompare(s.cmp, a.startKey, b) + }) + end, _ := slices.BinarySearchFunc(s.orderedIntervals, endIK, func(a fileInterval, b intervalKey) int { + return intervalKeyCompare(s.cmp, a.startKey, b) + }) + for i := start; i < end && i < len(s.orderedIntervals); i++ { + interval := &s.orderedIntervals[i] + if !c.IsIntraL0 { + interval.isBaseCompacting = true + } + } + } + + min := 0 + for i := range s.orderedIntervals { + interval := &s.orderedIntervals[i] + if interval.isBaseCompacting { + minIndex := interval.filesMinIntervalIndex + if minIndex < min { + minIndex = min + } + for j := minIndex; j <= interval.filesMaxIntervalIndex; j++ { + min = j + s.orderedIntervals[j].intervalRangeIsBaseCompacting = true + } + } + } +} + +// String produces a string containing useful debug information. Useful in test +// code and debugging. 
+func (s *L0Sublevels) String() string { + return s.describe(false) +} + +func (s *L0Sublevels) describe(verbose bool) string { + var buf strings.Builder + fmt.Fprintf(&buf, "file count: %d, sublevels: %d, intervals: %d\nflush split keys(%d): [", + s.levelMetadata.Len(), len(s.levelFiles), len(s.orderedIntervals), len(s.flushSplitUserKeys)) + for i := range s.flushSplitUserKeys { + fmt.Fprintf(&buf, "%s", s.formatKey(s.flushSplitUserKeys[i])) + if i < len(s.flushSplitUserKeys)-1 { + fmt.Fprintf(&buf, ", ") + } + } + fmt.Fprintln(&buf, "]") + numCompactingFiles := 0 + for i := len(s.levelFiles) - 1; i >= 0; i-- { + maxIntervals := 0 + sumIntervals := 0 + var totalBytes uint64 + for _, f := range s.levelFiles[i] { + intervals := f.maxIntervalIndex - f.minIntervalIndex + 1 + if intervals > maxIntervals { + maxIntervals = intervals + } + sumIntervals += intervals + totalBytes += f.Size + if f.IsCompacting() { + numCompactingFiles++ + } + } + fmt.Fprintf(&buf, "0.%d: file count: %d, bytes: %d, width (mean, max): %0.1f, %d, interval range: [%d, %d]\n", + i, len(s.levelFiles[i]), totalBytes, float64(sumIntervals)/float64(len(s.levelFiles[i])), maxIntervals, s.levelFiles[i][0].minIntervalIndex, + s.levelFiles[i][len(s.levelFiles[i])-1].maxIntervalIndex) + for _, f := range s.levelFiles[i] { + intervals := f.maxIntervalIndex - f.minIntervalIndex + 1 + if verbose { + fmt.Fprintf(&buf, "\t%s\n", f) + } + if s.levelMetadata.Len() > 50 && intervals*3 > len(s.orderedIntervals) { + var intervalsBytes uint64 + for k := f.minIntervalIndex; k <= f.maxIntervalIndex; k++ { + intervalsBytes += s.orderedIntervals[k].estimatedBytes + } + fmt.Fprintf(&buf, "wide file: %d, [%d, %d], byte fraction: %f\n", + f.FileNum, f.minIntervalIndex, f.maxIntervalIndex, + float64(intervalsBytes)/float64(s.fileBytes)) + } + } + } + + lastCompactingIntervalStart := -1 + fmt.Fprintf(&buf, "compacting file count: %d, base compacting intervals: ", numCompactingFiles) + i := 0 + foundBaseCompactingIntervals 
:= false + for ; i < len(s.orderedIntervals); i++ { + interval := &s.orderedIntervals[i] + if len(interval.files) == 0 { + continue + } + if !interval.isBaseCompacting { + if lastCompactingIntervalStart != -1 { + if foundBaseCompactingIntervals { + buf.WriteString(", ") + } + fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1) + foundBaseCompactingIntervals = true + } + lastCompactingIntervalStart = -1 + } else { + if lastCompactingIntervalStart == -1 { + lastCompactingIntervalStart = i + } + } + } + if lastCompactingIntervalStart != -1 { + if foundBaseCompactingIntervals { + buf.WriteString(", ") + } + fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1) + } else if !foundBaseCompactingIntervals { + fmt.Fprintf(&buf, "none") + } + fmt.Fprintln(&buf, "") + return buf.String() +} + +// ReadAmplification returns the contribution of L0Sublevels to the read +// amplification for any particular point key. It is the maximum height of any +// tracked fileInterval. This is always less than or equal to the number of +// sublevels. +func (s *L0Sublevels) ReadAmplification() int { + amp := 0 + for i := range s.orderedIntervals { + interval := &s.orderedIntervals[i] + fileCount := len(interval.files) + if amp < fileCount { + amp = fileCount + } + } + return amp +} + +// UserKeyRange encodes a key range in user key space. A UserKeyRange's Start +// and End boundaries are both inclusive. +type UserKeyRange struct { + Start, End []byte +} + +// InUseKeyRanges returns the merged table bounds of L0 files overlapping the +// provided user key range. The returned key ranges are sorted and +// nonoverlapping. +func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { + // Binary search to find the provided keys within the intervals. 
+ startIK := intervalKey{key: smallest, isLargest: false} + endIK := intervalKey{key: largest, isLargest: true} + start := sort.Search(len(s.orderedIntervals), func(i int) bool { + return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) > 0 + }) + if start > 0 { + // Back up to the first interval with a start key <= startIK. + start-- + } + end := sort.Search(len(s.orderedIntervals), func(i int) bool { + return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) > 0 + }) + + var keyRanges []UserKeyRange + var curr *UserKeyRange + for i := start; i < end; { + // Intervals with no files are not in use and can be skipped, once we + // end the current UserKeyRange. + if len(s.orderedIntervals[i].files) == 0 { + curr = nil + i++ + continue + } + + // If curr is nil, start a new in-use key range. + if curr == nil { + keyRanges = append(keyRanges, UserKeyRange{ + Start: s.orderedIntervals[i].startKey.key, + }) + curr = &keyRanges[len(keyRanges)-1] + } + + // If the filesMaxIntervalIndex is not the current index, we can jump to + // the max index, knowing that all intermediary intervals are overlapped + // by some file. + if maxIdx := s.orderedIntervals[i].filesMaxIntervalIndex; maxIdx != i { + // Note that end may be less than or equal to maxIdx if we're + // concerned with a key range that ends before the interval at + // maxIdx starts. We must set curr.End now, before making that leap, + // because this iteration may be the last. + i = maxIdx + curr.End = s.orderedIntervals[i+1].startKey.key + continue + } + + // No files overlapping with this interval overlap with the next + // interval. Update the current end to be the next interval's start key. + // Note that curr is not necessarily finished, because there may be an + // abutting non-empty interval. + curr.End = s.orderedIntervals[i+1].startKey.key + i++ + } + return keyRanges +} + +// FlushSplitKeys returns a slice of user keys to split flushes at. 
Used by +// flushes to avoid writing sstables that straddle these split keys. These +// should be interpreted as the keys to start the next sstable (not the last key +// to include in the prev sstable). These are user keys so that range tombstones +// can be properly truncated (untruncated range tombstones are not permitted for +// L0 files). +func (s *L0Sublevels) FlushSplitKeys() [][]byte { + return s.flushSplitUserKeys +} + +// MaxDepthAfterOngoingCompactions returns an estimate of maximum depth of +// sublevels after all ongoing compactions run to completion. Used by compaction +// picker to decide compaction score for L0. There is no scoring for intra-L0 +// compactions -- they only run if L0 score is high but we're unable to pick an +// L0 -> Lbase compaction. +func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int { + depth := 0 + for i := range s.orderedIntervals { + interval := &s.orderedIntervals[i] + intervalDepth := len(interval.files) - interval.compactingFileCount + if depth < intervalDepth { + depth = intervalDepth + } + } + return depth +} + +// Only for temporary debugging in the absence of proper tests. +// +// TODO(bilal): Simplify away the debugging statements in this method, and make +// this a pure sanity checker. 
+// +//lint:ignore U1000 - useful for debugging +func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error { + includedFiles := newBitSet(s.levelMetadata.Len()) + fileIntervalsByLevel := make([]struct { + min int + max int + }, len(s.levelFiles)) + for i := range fileIntervalsByLevel { + fileIntervalsByLevel[i].min = math.MaxInt32 + fileIntervalsByLevel[i].max = 0 + } + var topLevel int + var increment int + var limitReached func(int) bool + if c.isIntraL0 { + topLevel = len(s.levelFiles) - 1 + increment = +1 + limitReached = func(level int) bool { + return level == len(s.levelFiles) + } + } else { + topLevel = 0 + increment = -1 + limitReached = func(level int) bool { + return level < 0 + } + } + for _, f := range c.Files { + if fileIntervalsByLevel[f.SubLevel].min > f.minIntervalIndex { + fileIntervalsByLevel[f.SubLevel].min = f.minIntervalIndex + } + if fileIntervalsByLevel[f.SubLevel].max < f.maxIntervalIndex { + fileIntervalsByLevel[f.SubLevel].max = f.maxIntervalIndex + } + includedFiles.markBit(f.L0Index) + if c.isIntraL0 { + if topLevel > f.SubLevel { + topLevel = f.SubLevel + } + } else { + if topLevel < f.SubLevel { + topLevel = f.SubLevel + } + } + } + min := fileIntervalsByLevel[topLevel].min + max := fileIntervalsByLevel[topLevel].max + for level := topLevel; !limitReached(level); level += increment { + if fileIntervalsByLevel[level].min < min { + min = fileIntervalsByLevel[level].min + } + if fileIntervalsByLevel[level].max > max { + max = fileIntervalsByLevel[level].max + } + index, _ := slices.BinarySearchFunc(s.levelFiles[level], min, func(a *FileMetadata, b int) int { + return stdcmp.Compare(a.maxIntervalIndex, b) + }) + // start := index + for ; index < len(s.levelFiles[level]); index++ { + f := s.levelFiles[level][index] + if f.minIntervalIndex > max { + break + } + if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum { + return errors.Errorf( + "sstable %s in compaction has sequence numbers higher than the earliest unflushed 
seqnum %d: %d-%d", + f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum, + f.LargestSeqNum) + } + if !includedFiles[f.L0Index] { + var buf strings.Builder + fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n", + c.isIntraL0, c.seedInterval, level, index, f.L0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval, + f.minIntervalIndex, f.maxIntervalIndex, + f.FileNum, f.IsCompacting(), s) + fmt.Fprintf(&buf, "files included:\n") + for _, f := range c.Files { + fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", + f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) + } + fmt.Fprintf(&buf, "files added:\n") + for _, f := range c.filesAdded { + fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", + f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) + } + return errors.New(buf.String()) + } + } + } + return nil +} + +// UpdateStateForStartedCompaction updates internal L0Sublevels state for a +// recently started compaction. isBase specifies if this is a base compaction; +// if false, this is assumed to be an intra-L0 compaction. The specified +// compaction must be involving L0 SSTables. It's assumed that the Compacting +// and IsIntraL0Compacting fields are already set on all [FileMetadata]s passed +// in. 
+func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error { + minIntervalIndex := -1 + maxIntervalIndex := 0 + for i := range inputs { + iter := inputs[i].Iter() + for f := iter.First(); f != nil; f = iter.Next() { + for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { + interval := &s.orderedIntervals[i] + interval.compactingFileCount++ + } + if f.minIntervalIndex < minIntervalIndex || minIntervalIndex == -1 { + minIntervalIndex = f.minIntervalIndex + } + if f.maxIntervalIndex > maxIntervalIndex { + maxIntervalIndex = f.maxIntervalIndex + } + } + } + if isBase { + for i := minIntervalIndex; i <= maxIntervalIndex; i++ { + interval := &s.orderedIntervals[i] + interval.isBaseCompacting = isBase + for j := interval.filesMinIntervalIndex; j <= interval.filesMaxIntervalIndex; j++ { + s.orderedIntervals[j].intervalRangeIsBaseCompacting = true + } + } + } + return nil +} + +// L0CompactionFiles represents a candidate set of L0 files for compaction. Also +// referred to as "lcf". Contains state information useful for generating the +// compaction (such as Files), as well as for picking between candidate +// compactions (eg. fileBytes and seedIntervalStackDepthReduction). +type L0CompactionFiles struct { + Files []*FileMetadata + + FilesIncluded bitSet + // A "seed interval" is an interval with a high stack depth that was chosen + // to bootstrap this compaction candidate. seedIntervalStackDepthReduction + // is the number of sublevels that have a file in the seed interval that is + // a part of this compaction. + seedIntervalStackDepthReduction int + // For base compactions, seedIntervalMinLevel is 0, and for intra-L0 + // compactions, seedIntervalMaxLevel is len(s.Files)-1 i.e. the highest + // sublevel. + seedIntervalMinLevel int + seedIntervalMaxLevel int + // Index of the seed interval. + seedInterval int + // Sum of file sizes for all files in this compaction. 
+ fileBytes uint64 + // Intervals with index [minIntervalIndex, maxIntervalIndex] are + // participating in this compaction; it's the union set of all intervals + // overlapped by participating files. + minIntervalIndex int + maxIntervalIndex int + + // Set for intra-L0 compactions. SSTables with sequence numbers greater + // than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions. + isIntraL0 bool + earliestUnflushedSeqNum uint64 + + // For debugging purposes only. Used in checkCompaction(). + preExtensionMinInterval int + preExtensionMaxInterval int + filesAdded []*FileMetadata +} + +// Clone allocates a new L0CompactionFiles, with the same underlying data. Note +// that the two fileMetadata slices contain values that point to the same +// underlying fileMetadata object. This is safe because these objects are read +// only. +func (l *L0CompactionFiles) Clone() *L0CompactionFiles { + oldLcf := *l + return &oldLcf +} + +// String merely prints the starting address of the first file, if it exists. +func (l *L0CompactionFiles) String() string { + if len(l.Files) > 0 { + return fmt.Sprintf("First File Address: %p", &l.Files[0]) + } + return "" +} + +// addFile adds the specified file to the LCF. +func (l *L0CompactionFiles) addFile(f *FileMetadata) { + if l.FilesIncluded[f.L0Index] { + return + } + l.FilesIncluded.markBit(f.L0Index) + l.Files = append(l.Files, f) + l.filesAdded = append(l.filesAdded, f) + l.fileBytes += f.Size + if f.minIntervalIndex < l.minIntervalIndex { + l.minIntervalIndex = f.minIntervalIndex + } + if f.maxIntervalIndex > l.maxIntervalIndex { + l.maxIntervalIndex = f.maxIntervalIndex + } +} + +// Helper to order intervals being considered for compaction. 
+type intervalAndScore struct { + interval int + score int +} +type intervalSorterByDecreasingScore []intervalAndScore + +func (is intervalSorterByDecreasingScore) Len() int { return len(is) } +func (is intervalSorterByDecreasingScore) Less(i, j int) bool { + return is[i].score > is[j].score +} +func (is intervalSorterByDecreasingScore) Swap(i, j int) { + is[i], is[j] = is[j], is[i] +} + +// Compactions: +// +// The sub-levels and intervals can be visualized in 2 dimensions as the X axis +// containing intervals in increasing order and the Y axis containing sub-levels +// (older to younger). The intervals can be sparse wrt sub-levels. We observe +// that the system is typically under severe pressure in L0 during large numbers +// of ingestions where most files added to L0 are narrow and non-overlapping. +// +// L0.1 d---g +// L0.0 c--e g--j o--s u--x +// +// As opposed to a case with a lot of wide, overlapping L0 files: +// +// L0.3 d-----------r +// L0.2 c--------o +// L0.1 b-----------q +// L0.0 a----------------x +// +// In that case we expect the rectangle represented in the good visualization +// above (i.e. the first one) to be wide and short, and not too sparse (most +// intervals will have fileCount close to the sub-level count), which would make +// it amenable to concurrent L0 -> Lbase compactions. +// +// L0 -> Lbase: The high-level goal of a L0 -> Lbase compaction is to reduce +// stack depth, by compacting files in the intervals with the highest (fileCount +// - compactingCount). Additionally, we would like compactions to not involve a +// huge number of files, so that they finish quickly, and to allow for +// concurrent L0 -> Lbase compactions when needed. In order to achieve these +// goals we would like compactions to visualize as capturing thin and tall +// rectangles. The approach below is to consider intervals in some order and +// then try to construct a compaction using the interval. 
The first interval we +// can construct a compaction for is the compaction that is started. There can +// be multiple heuristics in choosing the ordering of the intervals -- the code +// uses one heuristic that worked well for a large ingestion stemming from a +// cockroachdb import, but additional experimentation is necessary to pick a +// general heuristic. Additionally, the compaction that gets picked may be not +// as desirable as one that could be constructed later in terms of reducing +// stack depth (since adding more files to the compaction can get blocked by +// needing to encompass files that are already being compacted). So an +// alternative would be to try to construct more than one compaction and pick +// the best one. +// +// Here's a visualization of an ideal L0->LBase compaction selection: +// +// L0.3 a--d g-j +// L0.2 f--j r-t +// L0.1 b-d e---j +// L0.0 a--d f--j l--o p-----x +// +// Lbase a--------i m---------w +// +// The [g,j] interval has the highest stack depth, so it would have the highest +// priority for selecting a base compaction candidate. Assuming none of the +// files are already compacting, this is the compaction that will be chosen: +// +// _______ +// L0.3 a--d | g-j| +// L0.2 | f--j| r-t +// L0.1 b-d |e---j| +// L0.0 a--d | f--j| l--o p-----x +// +// Lbase a--------i m---------w +// +// Note that running this compaction will mark the a--i file in Lbase as +// compacting, and when ExtendL0ForBaseCompactionTo is called with the bounds of +// that base file, it'll expand the compaction to also include all L0 files in +// the a-d interval. The resultant compaction would then be: +// +// _____________ +// L0.3 |a--d g-j| +// L0.2 | f--j| r-t +// L0.1 | b-d e---j| +// L0.0 |a--d f--j| l--o p-----x +// +// Lbase a--------i m---------w +// +// The next best interval for base compaction would therefore be the one +// including r--t in L0.2 and p--x in L0.0, and both this compaction and the one +// picked earlier can run in parallel. 
This is assuming minCompactionDepth >= 2, +// otherwise the second compaction has too little depth to pick. +// +// _____________ +// L0.3 |a--d g-j| _________ +// L0.2 | f--j| | r-t | +// L0.1 | b-d e---j| | | +// L0.0 |a--d f--j| l--o |p-----x| +// +// Lbase a--------i m---------w +// +// Note that when ExtendL0ForBaseCompactionTo is called, the compaction expands +// to the following, given that the [l,o] file can be added without including +// additional files in Lbase: +// +// _____________ +// L0.3 |a--d g-j| _________ +// L0.2 | f--j| | r-t | +// L0.1 | b-d e---j|______| | +// L0.0 |a--d f--j||l--o p-----x| +// +// Lbase a--------i m---------w +// +// If an additional file existed in LBase that overlapped with [l,o], it would +// be excluded from the compaction. Concretely: +// +// _____________ +// L0.3 |a--d g-j| _________ +// L0.2 | f--j| | r-t | +// L0.1 | b-d e---j| | | +// L0.0 |a--d f--j| l--o |p-----x| +// +// Lbase a--------ij--lm---------w +// +// Intra-L0: If the L0 score is high, but PickBaseCompaction() is unable to pick +// a compaction, PickIntraL0Compaction will be used to pick an intra-L0 +// compaction. Similar to L0 -> Lbase compactions, we want to allow for multiple +// intra-L0 compactions and not generate wide output files that hinder later +// concurrency of L0 -> Lbase compactions. Also compactions that produce wide +// files don't reduce stack depth -- they represent wide rectangles in our +// visualization, which means many intervals have their depth reduced by a small +// amount. Typically, L0 files have non-overlapping sequence numbers, and +// sticking to that invariant would require us to consider intra-L0 compactions +// that proceed from youngest to oldest files, which could result in the +// aforementioned undesirable wide rectangle shape. But this non-overlapping +// sequence number is already relaxed in RocksDB -- sstables are primarily +// ordered by their largest sequence number. 
So we can arrange for intra-L0 +// compactions to capture thin and tall rectangles starting with the top of the +// stack (youngest files). Like the L0 -> Lbase case we order the intervals +// using a heuristic and consider each in turn. The same comment about better L0 +// -> Lbase heuristics and not being greedy applies here. +// +// Going back to a modified version of our example from earlier, let's say these +// are the base compactions in progress: +// _______ +// L0.3 a--d | g-j| _________ +// L0.2 | f--j| | r-t | +// L0.1 b-d |e---j| | | +// L0.0 a--d | f--j| l--o |p-----x| +// +// Lbase a---------i m---------w +// +// Since both LBase files are compacting, the only L0 compaction that can be +// picked is an intra-L0 compaction. For this, the b--d interval has the highest +// stack depth (3), and starting with a--d in L0.3 as the seed file, we can +// iterate downward and build this compaction, assuming all files in that +// interval are not compacting and have a highest sequence number less than +// earliestUnflushedSeqNum: +// +// _______ +// L0.3 |a--d| | g-j| _________ +// L0.2 | | | f--j| | r-t | +// L0.1 | b-d| |e---j| | | +// L0.0 |a--d| | f--j| l--o |p-----x| +// ------ +// Lbase a---------i m---------w +// + +// PickBaseCompaction picks a base compaction based on the above specified +// heuristics, for the specified Lbase files and a minimum depth of overlapping +// files that can be selected for compaction. Returns nil if no compaction is +// possible. +func (s *L0Sublevels) PickBaseCompaction( + minCompactionDepth int, baseFiles LevelSlice, +) (*L0CompactionFiles, error) { + // For LBase compactions, we consider intervals in a greedy manner in the + // following order: + // - Intervals that are unlikely to be blocked due + // to ongoing L0 -> Lbase compactions. These are the ones with + // !isBaseCompacting && !intervalRangeIsBaseCompacting. + // - Intervals that are !isBaseCompacting && intervalRangeIsBaseCompacting. 
+ // + // The ordering heuristic exists just to avoid wasted work. Ideally, + // we would consider all intervals with isBaseCompacting = false and + // construct a compaction for it and compare the constructed compactions + // and pick the best one. If microbenchmarks show that we can afford + // this cost we can eliminate this heuristic. + scoredIntervals := make([]intervalAndScore, 0, len(s.orderedIntervals)) + sublevelCount := len(s.levelFiles) + for i := range s.orderedIntervals { + interval := &s.orderedIntervals[i] + depth := len(interval.files) - interval.compactingFileCount + if interval.isBaseCompacting || minCompactionDepth > depth { + continue + } + if interval.intervalRangeIsBaseCompacting { + scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth}) + } else { + // Prioritize this interval by incrementing the score by the number + // of sublevels. + scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth + sublevelCount}) + } + } + sort.Sort(intervalSorterByDecreasingScore(scoredIntervals)) + + // Optimization to avoid considering different intervals that + // are likely to choose the same seed file. Again this is just + // to reduce wasted work. + consideredIntervals := newBitSet(len(s.orderedIntervals)) + for _, scoredInterval := range scoredIntervals { + interval := &s.orderedIntervals[scoredInterval.interval] + if consideredIntervals[interval.index] { + continue + } + + // Pick the seed file for the interval as the file + // in the lowest sub-level. + f := interval.files[0] + // Don't bother considering the intervals that are covered by the seed + // file since they are likely nearby. Note that it is possible that + // those intervals have seed files at lower sub-levels so could be + // viable for compaction. 
+ if f == nil { + return nil, errors.New("no seed file found in sublevel intervals") + } + consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) + if f.IsCompacting() { + if f.IsIntraL0Compacting { + // If we're picking a base compaction and we came across a seed + // file candidate that's being intra-L0 compacted, skip the + // interval instead of erroring out. + continue + } + // We chose a compaction seed file that should not be compacting. + // Usually means the score is not accurately accounting for files + // already compacting, or internal state is inconsistent. + return nil, errors.Errorf("file %s chosen as seed file for compaction should not be compacting", f.FileNum) + } + + c := s.baseCompactionUsingSeed(f, interval.index, minCompactionDepth) + if c != nil { + // Check if the chosen compaction overlaps with any files in Lbase + // that have Compacting = true. If that's the case, this compaction + // cannot be chosen. + baseIter := baseFiles.Iter() + // An interval starting at ImmediateSuccessor(key) can never be the + // first interval of a compaction since no file can start at that + // interval. + m := baseIter.SeekGE(s.cmp, s.orderedIntervals[c.minIntervalIndex].startKey.key) + + var baseCompacting bool + for ; m != nil && !baseCompacting; m = baseIter.Next() { + cmp := s.cmp(m.Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key) + // Compaction is ending at exclusive bound of c.maxIntervalIndex+1 + if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) { + break + } + baseCompacting = baseCompacting || m.IsCompacting() + } + if baseCompacting { + continue + } + return c, nil + } + } + return nil, nil +} + +// Helper function for building an L0 -> Lbase compaction using a seed interval +// and seed file in that seed interval. 
+func (s *L0Sublevels) baseCompactionUsingSeed( + f *FileMetadata, intervalIndex int, minCompactionDepth int, +) *L0CompactionFiles { + c := &L0CompactionFiles{ + FilesIncluded: newBitSet(s.levelMetadata.Len()), + seedInterval: intervalIndex, + seedIntervalMinLevel: 0, + minIntervalIndex: f.minIntervalIndex, + maxIntervalIndex: f.maxIntervalIndex, + } + c.addFile(f) + + // The first iteration of this loop builds the compaction at the seed file's + // sublevel. Future iterations expand on this compaction by stacking more + // files from intervalIndex and repeating. This is an optional activity so + // when it fails we can fallback to the last successful candidate. + var lastCandidate *L0CompactionFiles + interval := &s.orderedIntervals[intervalIndex] + + for i := 0; i < len(interval.files); i++ { + f2 := interval.files[i] + sl := f2.SubLevel + c.seedIntervalStackDepthReduction++ + c.seedIntervalMaxLevel = sl + c.addFile(f2) + // The seed file is in the lowest sublevel in the seed interval, but it + // may overlap with other files in even lower sublevels. For correctness + // we need to grow our interval to include those files, and capture all + // files in the next level that fall in this extended interval and so + // on. This can result in a triangular shape like the following where + // again the X axis is the key intervals and the Y axis is oldest to + // youngest. Note that it is not necessary for correctness to fill out + // the shape at the higher sub-levels to make it more rectangular since + // the invariant only requires that younger versions of a key not be + // moved to Lbase while leaving behind older versions. + // - + // --- + // ----- + // It may be better for performance to have a more rectangular shape + // since the files being left behind will overlap with the same Lbase + // key range as that of this compaction. 
But there is also the danger + // that in trying to construct a more rectangular shape we will be + // forced to pull in a file that is already compacting. We expect + // extendCandidateToRectangle to eventually be called on this compaction + // if it's chosen, at which point we would iterate backward and choose + // those files. This logic is similar to compaction.grow for non-L0 + // compactions. + done := false + for currLevel := sl - 1; currLevel >= 0; currLevel-- { + if !s.extendFiles(currLevel, math.MaxUint64, c) { + // Failed to extend due to ongoing compaction. + done = true + break + } + } + if done { + break + } + // Observed some compactions using > 1GB from L0 in an import + // experiment. Very long running compactions are not great as they + // reduce concurrency while they run, and take a while to produce + // results, though they're sometimes unavoidable. There is a tradeoff + // here in that adding more depth is more efficient in reducing stack + // depth, but long running compactions reduce flexibility in what can + // run concurrently in L0 and even Lbase -> Lbase+1. An increase more + // than 150% in bytes since the last candidate compaction (along with a + // total compaction size in excess of 100mb), or a total compaction size + // beyond a hard limit of 500mb, is criteria for rejecting this + // candidate. This lets us prefer slow growths as we add files, while + // still having a hard limit. Note that if this is the first compaction + // candidate to reach a stack depth reduction of minCompactionDepth or + // higher, this candidate will be chosen regardless. 
+ if lastCandidate == nil { + lastCandidate = &L0CompactionFiles{} + } else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth && + c.fileBytes > 100<<20 && + (float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) { + break + } + *lastCandidate = *c + } + if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth { + lastCandidate.FilesIncluded.clearAllBits() + for _, f := range lastCandidate.Files { + lastCandidate.FilesIncluded.markBit(f.L0Index) + } + return lastCandidate + } + return nil +} + +// Expands fields in the provided L0CompactionFiles instance (cFiles) to +// include overlapping files in the specified sublevel. Returns true if the +// compaction is possible (i.e. does not conflict with any base/intra-L0 +// compacting files). +func (s *L0Sublevels) extendFiles( + sl int, earliestUnflushedSeqNum uint64, cFiles *L0CompactionFiles, +) bool { + index, _ := slices.BinarySearchFunc(s.levelFiles[sl], cFiles.minIntervalIndex, func(a *FileMetadata, b int) int { + return stdcmp.Compare(a.maxIntervalIndex, b) + }) + for ; index < len(s.levelFiles[sl]); index++ { + f := s.levelFiles[sl][index] + if f.minIntervalIndex > cFiles.maxIntervalIndex { + break + } + if f.IsCompacting() { + return false + } + // Skip over files that are newer than earliestUnflushedSeqNum. This is + // okay because this compaction can just pretend these files are not in + // L0 yet. These files must be in higher sublevels than any overlapping + // files with f.LargestSeqNum < earliestUnflushedSeqNum, and the output + // of the compaction will also go in a lower (older) sublevel than this + // file by definition. + if f.LargestSeqNum >= earliestUnflushedSeqNum { + continue + } + cFiles.addFile(f) + } + return true +} + +// PickIntraL0Compaction picks an intra-L0 compaction for files in this +// sublevel. This method is only called when a base compaction cannot be chosen. 
+// See comment above [PickBaseCompaction] for heuristics involved in this +// selection. +func (s *L0Sublevels) PickIntraL0Compaction( + earliestUnflushedSeqNum uint64, minCompactionDepth int, +) (*L0CompactionFiles, error) { + scoredIntervals := make([]intervalAndScore, len(s.orderedIntervals)) + for i := range s.orderedIntervals { + interval := &s.orderedIntervals[i] + depth := len(interval.files) - interval.compactingFileCount + if minCompactionDepth > depth { + continue + } + scoredIntervals[i] = intervalAndScore{interval: i, score: depth} + } + sort.Sort(intervalSorterByDecreasingScore(scoredIntervals)) + + // Optimization to avoid considering different intervals that are likely to + // choose the same seed file. Again this is just to reduce wasted work. + consideredIntervals := newBitSet(len(s.orderedIntervals)) + for _, scoredInterval := range scoredIntervals { + interval := &s.orderedIntervals[scoredInterval.interval] + if consideredIntervals[interval.index] { + continue + } + + var f *FileMetadata + // Pick the seed file for the interval as the file in the highest + // sub-level. + stackDepthReduction := scoredInterval.score + for i := len(interval.files) - 1; i >= 0; i-- { + f = interval.files[i] + if f.IsCompacting() { + break + } + consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) + // Can this be the seed file? Files with newer sequence numbers than + // earliestUnflushedSeqNum cannot be in the compaction. + if f.LargestSeqNum >= earliestUnflushedSeqNum { + stackDepthReduction-- + if stackDepthReduction == 0 { + break + } + } else { + break + } + } + if stackDepthReduction < minCompactionDepth { + // Can't use this interval. + continue + } + + if f == nil { + return nil, errors.New("no seed file found in sublevel intervals") + } + if f.IsCompacting() { + // This file could be in a concurrent intra-L0 or base compaction. + // Try another interval. + continue + } + + // We have a seed file. Build a compaction off of that seed. 
+ c := s.intraL0CompactionUsingSeed( + f, interval.index, earliestUnflushedSeqNum, minCompactionDepth) + if c != nil { + return c, nil + } + } + return nil, nil +} + +func (s *L0Sublevels) intraL0CompactionUsingSeed( + f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int, +) *L0CompactionFiles { + // We know that all the files that overlap with intervalIndex have + // LargestSeqNum < earliestUnflushedSeqNum, but for other intervals + // we need to exclude files >= earliestUnflushedSeqNum + + c := &L0CompactionFiles{ + FilesIncluded: newBitSet(s.levelMetadata.Len()), + seedInterval: intervalIndex, + seedIntervalMaxLevel: len(s.levelFiles) - 1, + minIntervalIndex: f.minIntervalIndex, + maxIntervalIndex: f.maxIntervalIndex, + isIntraL0: true, + earliestUnflushedSeqNum: earliestUnflushedSeqNum, + } + c.addFile(f) + + var lastCandidate *L0CompactionFiles + interval := &s.orderedIntervals[intervalIndex] + slIndex := len(interval.files) - 1 + for { + if interval.files[slIndex] == f { + break + } + slIndex-- + } + // The first iteration of this loop produces an intra-L0 compaction at the + // seed level. Iterations after that optionally add to the compaction by + // stacking more files from intervalIndex and repeating. This is an optional + // activity so when it fails we can fallback to the last successful + // candidate. The code stops adding when it can't add more, or when + // fileBytes grows too large. + for ; slIndex >= 0; slIndex-- { + f2 := interval.files[slIndex] + sl := f2.SubLevel + if f2.IsCompacting() { + break + } + c.seedIntervalStackDepthReduction++ + c.seedIntervalMinLevel = sl + c.addFile(f2) + // The seed file captures all files in the higher level that fall in the + // range of intervals. That may extend the range of intervals so for + // correctness we need to capture all files in the next higher level + // that fall in this extended interval and so on. 
This can result in an + // inverted triangular shape like the following where again the X axis + // is the key intervals and the Y axis is oldest to youngest. Note that + // it is not necessary for correctness to fill out the shape at lower + // sub-levels to make it more rectangular since the invariant only + // requires that if we move an older seqnum for key k into a file that + // has a higher seqnum, we also move all younger seqnums for that key k + // into that file. + // ----- + // --- + // - + // It may be better for performance to have a more rectangular shape + // since it will reduce the stack depth for more intervals. But there is + // also the danger that in explicitly trying to construct a more + // rectangular shape we will be forced to pull in a file that is already + // compacting. We assume that the performance concern is not a practical + // issue. + done := false + for currLevel := sl + 1; currLevel < len(s.levelFiles); currLevel++ { + if !s.extendFiles(currLevel, earliestUnflushedSeqNum, c) { + // Failed to extend due to ongoing compaction. 
+ done = true + break + } + } + if done { + break + } + if lastCandidate == nil { + lastCandidate = &L0CompactionFiles{} + } else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth && + c.fileBytes > 100<<20 && + (float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) { + break + } + *lastCandidate = *c + } + if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth { + lastCandidate.FilesIncluded.clearAllBits() + for _, f := range lastCandidate.Files { + lastCandidate.FilesIncluded.markBit(f.L0Index) + } + s.extendCandidateToRectangle( + lastCandidate.minIntervalIndex, lastCandidate.maxIntervalIndex, lastCandidate, false) + return lastCandidate + } + return nil +} + +// ExtendL0ForBaseCompactionTo extends the specified base compaction candidate +// L0CompactionFiles to optionally cover more files in L0 without "touching" any +// of the passed-in keys (i.e. the smallest/largest bounds are exclusive), as +// including any user keys for those internal keys could require choosing more +// files in LBase which is undesirable. Unbounded start/end keys are indicated +// by passing in the InvalidInternalKey. +func (s *L0Sublevels) ExtendL0ForBaseCompactionTo( + smallest, largest InternalKey, candidate *L0CompactionFiles, +) bool { + firstIntervalIndex := 0 + lastIntervalIndex := len(s.orderedIntervals) - 1 + if smallest.Kind() != base.InternalKeyKindInvalid { + if smallest.Trailer == base.InternalKeyRangeDeleteSentinel { + // Starting at smallest.UserKey == interval.startKey is okay. 
+ firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { + return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) <= 0 + }) + } else { + firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { + // Need to start at >= smallest since if we widen too much we may miss + // an Lbase file that overlaps with an L0 file that will get picked in + // this widening, which would be bad. This interval will not start with + // an immediate successor key. + return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) < 0 + }) + } + } + if largest.Kind() != base.InternalKeyKindInvalid { + // First interval that starts at or beyond the largest. This interval will not + // start with an immediate successor key. + lastIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { + return s.cmp(largest.UserKey, s.orderedIntervals[i].startKey.key) <= 0 + }) + // Right now, lastIntervalIndex has a startKey that extends beyond largest. + // The previous interval, by definition, has an end key higher than largest. + // Iterate back twice to get the last interval that's completely within + // (smallest, largest). Except in the case where we went past the end of the + // list; in that case, the last interval to include is the very last + // interval in the list. + if lastIntervalIndex < len(s.orderedIntervals) { + lastIntervalIndex-- + } + lastIntervalIndex-- + } + if lastIntervalIndex < firstIntervalIndex { + return false + } + return s.extendCandidateToRectangle(firstIntervalIndex, lastIntervalIndex, candidate, true) +} + +// Best-effort attempt to make the compaction include more files in the +// rectangle defined by [minIntervalIndex, maxIntervalIndex] on the X axis and +// bounded on the Y axis by seedIntervalMinLevel and seedIntervalMaxLevel. 
+// +// This is strictly an optional extension; at any point where we can't feasibly +// add more files, the sublevel iteration can be halted early and candidate will +// still be a correct compaction candidate. +// +// Consider this scenario (original candidate is inside the rectangle), with +// isBase = true and interval bounds a-j (from the union of base file bounds and +// that of compaction candidate): +// +// _______ +// L0.3 a--d | g-j| +// L0.2 | f--j| r-t +// L0.1 b-d |e---j| +// L0.0 a--d | f--j| l--o p-----x +// +// Lbase a--------i m---------w +// +// This method will iterate from the bottom up. At L0.0, it will add a--d since +// it's in the bounds, then add b-d, then a--d, and so on, to produce this: +// +// _____________ +// L0.3 |a--d g-j| +// L0.2 | f--j| r-t +// L0.1 | b-d e---j| +// L0.0 |a--d f--j| l--o p-----x +// +// Lbase a-------i m---------w +// +// Let's assume that, instead of a--d in the top sublevel, we had 3 files, a-b, +// bb-c, and cc-d, of which bb-c is compacting. Let's also add another sublevel +// L0.4 with some files, all of which aren't compacting: +// +// L0.4 a------c ca--d _______ +// L0.3 a-b bb-c cc-d | g-j| +// L0.2 | f--j| r-t +// L0.1 b----------d |e---j| +// L0.0 a------------d | f--j| l--o p-----x +// +// Lbase a------------------i m---------w +// +// This method then needs to choose between the left side of L0.3 bb-c (i.e. +// a-b), or the right side (i.e. cc-d and g-j) for inclusion in this compaction. 
+// Since the right side has more files as well as one file that has already been +// picked, it gets chosen at that sublevel, resulting in this intermediate +// compaction: +// +// L0.4 a------c ca--d +// ______________ +// L0.3 a-b bb-c| cc-d g-j| +// L0.2 _________| f--j| r-t +// L0.1 | b----------d e---j| +// L0.0 |a------------d f--j| l--o p-----x +// +// Lbase a------------------i m---------w +// +// Since bb-c had to be excluded at L0.3, the interval bounds for L0.4 are +// actually ca-j, since ca is the next interval start key after the end interval +// of bb-c. This would result in only ca-d being chosen at that sublevel, even +// though a--c is also not compacting. This is the final result: +// +// ______________ +// L0.4 a------c|ca--d | +// L0.3 a-b bb-c| cc-d g-j| +// L0.2 _________| f--j| r-t +// L0.1 | b----------d e---j| +// L0.0 |a------------d f--j| l--o p-----x +// +// Lbase a------------------i m---------w +// +// TODO(bilal): Add more targeted tests for this method, through +// ExtendL0ForBaseCompactionTo and intraL0CompactionUsingSeed. +func (s *L0Sublevels) extendCandidateToRectangle( + minIntervalIndex int, maxIntervalIndex int, candidate *L0CompactionFiles, isBase bool, +) bool { + candidate.preExtensionMinInterval = candidate.minIntervalIndex + candidate.preExtensionMaxInterval = candidate.maxIntervalIndex + // Extend {min,max}IntervalIndex to include all of the candidate's current + // bounds. + if minIntervalIndex > candidate.minIntervalIndex { + minIntervalIndex = candidate.minIntervalIndex + } + if maxIntervalIndex < candidate.maxIntervalIndex { + maxIntervalIndex = candidate.maxIntervalIndex + } + var startLevel, increment, endLevel int + if isBase { + startLevel = 0 + increment = +1 + // seedIntervalMaxLevel is inclusive, while endLevel is exclusive. + endLevel = candidate.seedIntervalMaxLevel + 1 + } else { + startLevel = len(s.levelFiles) - 1 + increment = -1 + // seedIntervalMinLevel is inclusive, while endLevel is exclusive. 
+ endLevel = candidate.seedIntervalMinLevel - 1 + } + // Stats for files. + addedCount := 0 + // Iterate from the oldest sub-level for L0 -> Lbase and youngest sub-level + // for intra-L0. The idea here is that anything that can't be included from + // that level constrains what can be included from the next level. This + // change in constraint is directly incorporated into minIntervalIndex, + // maxIntervalIndex. + for sl := startLevel; sl != endLevel; sl += increment { + files := s.levelFiles[sl] + // Find the first file that overlaps with minIntervalIndex. + index := sort.Search(len(files), func(i int) bool { + return minIntervalIndex <= files[i].maxIntervalIndex + }) + // Track the files that are fully within the current constraint of + // [minIntervalIndex, maxIntervalIndex]. + firstIndex := -1 + lastIndex := -1 + for ; index < len(files); index++ { + f := files[index] + if f.minIntervalIndex > maxIntervalIndex { + break + } + include := true + // Extends out on the left so can't be included. This narrows what + // we can include in the next level. + if f.minIntervalIndex < minIntervalIndex { + include = false + minIntervalIndex = f.maxIntervalIndex + 1 + } + // Extends out on the right so can't be included. + if f.maxIntervalIndex > maxIntervalIndex { + include = false + maxIntervalIndex = f.minIntervalIndex - 1 + } + if !include { + continue + } + if firstIndex == -1 { + firstIndex = index + } + lastIndex = index + } + if minIntervalIndex > maxIntervalIndex { + // We excluded files that prevent continuation. + break + } + if firstIndex < 0 { + // No files to add in this sub-level. + continue + } + // We have the files in [firstIndex, lastIndex] as potential for + // inclusion. Some of these may already have been picked. Some of them + // may be already compacting. The latter is tricky since we have to + // decide whether to contract minIntervalIndex or maxIntervalIndex when + // we encounter an already compacting file. 
We pick the longest sequence + // between firstIndex and lastIndex of non-compacting files -- this is + // represented by [candidateNonCompactingFirst, + // candidateNonCompactingLast]. + nonCompactingFirst := -1 + currentRunHasAlreadyPickedFiles := false + candidateNonCompactingFirst := -1 + candidateNonCompactingLast := -1 + candidateHasAlreadyPickedFiles := false + for index = firstIndex; index <= lastIndex; index++ { + f := files[index] + if f.IsCompacting() { + if nonCompactingFirst != -1 { + last := index - 1 + // Prioritize runs of consecutive non-compacting files that + // have files that have already been picked. That is to say, + // if candidateHasAlreadyPickedFiles == true, we stick with + // it, and if currentRunHasAlreadyPickedFiles == true, we + // pick that run even if it contains fewer files than the + // previous candidate. + if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 || + currentRunHasAlreadyPickedFiles || + (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) { + candidateNonCompactingFirst = nonCompactingFirst + candidateNonCompactingLast = last + candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles + } + } + nonCompactingFirst = -1 + currentRunHasAlreadyPickedFiles = false + continue + } + if nonCompactingFirst == -1 { + nonCompactingFirst = index + } + if candidate.FilesIncluded[f.L0Index] { + currentRunHasAlreadyPickedFiles = true + } + } + // Logic duplicated from inside the for loop above. + if nonCompactingFirst != -1 { + last := index - 1 + if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 || + currentRunHasAlreadyPickedFiles || + (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) { + candidateNonCompactingFirst = nonCompactingFirst + candidateNonCompactingLast = last + } + } + if candidateNonCompactingFirst == -1 { + // All files are compacting. 
There will be gaps that we could + // exploit to continue, but don't bother. + break + } + // May need to shrink [minIntervalIndex, maxIntervalIndex] for the next level. + if candidateNonCompactingFirst > firstIndex { + minIntervalIndex = files[candidateNonCompactingFirst-1].maxIntervalIndex + 1 + } + if candidateNonCompactingLast < lastIndex { + maxIntervalIndex = files[candidateNonCompactingLast+1].minIntervalIndex - 1 + } + for index := candidateNonCompactingFirst; index <= candidateNonCompactingLast; index++ { + f := files[index] + if f.IsCompacting() { + // TODO(bilal): Do a logger.Fatalf instead of a panic, for + // cleaner unwinding and error messages. + panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum)) + } + if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum { + continue + } + if !candidate.FilesIncluded[f.L0Index] { + addedCount++ + candidate.addFile(f) + } + } + } + return addedCount > 0 +} diff --git a/pebble/internal/manifest/l0_sublevels_test.go b/pebble/internal/manifest/l0_sublevels_test.go new file mode 100644 index 0000000..8cedb87 --- /dev/null +++ b/pebble/internal/manifest/l0_sublevels_test.go @@ -0,0 +1,620 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package manifest + +import ( + "bytes" + "fmt" + "io" + "math" + "os" + "slices" + "sort" + "strconv" + "strings" + "testing" + "time" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/record" + "github.com/stretchr/testify/require" + "golang.org/x/exp/rand" +) + +func readManifest(filename string) (*Version, error) { + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer f.Close() + rr := record.NewReader(f, 0 /* logNum */) + var v *Version + addedByFileNum := make(map[base.FileNum]*FileMetadata) + for { + r, err := rr.Next() + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + var ve VersionEdit + if err = ve.Decode(r); err != nil { + return nil, err + } + var bve BulkVersionEdit + bve.AddedByFileNum = addedByFileNum + if err := bve.Accumulate(&ve); err != nil { + return nil, err + } + if v, err = bve.Apply(v, base.DefaultComparer.Compare, base.DefaultFormatter, 10<<20, 32000, nil, ProhibitSplitUserKeys); err != nil { + return nil, err + } + } + return v, nil +} + +func visualizeSublevels( + s *L0Sublevels, compactionFiles bitSet, otherLevels [][]*FileMetadata, +) string { + var buf strings.Builder + if compactionFiles == nil { + compactionFiles = newBitSet(s.levelMetadata.Len()) + } + largestChar := byte('a') + printLevel := func(files []*FileMetadata, level string, isL0 bool) { + lastChar := byte('a') + fmt.Fprintf(&buf, "L%s:", level) + for i := 0; i < 5-len(level); i++ { + buf.WriteByte(' ') + } + for j, f := range files { + for lastChar < f.Smallest.UserKey[0] { + buf.WriteString(" ") + lastChar++ + } + buf.WriteByte(f.Smallest.UserKey[0]) + middleChar := byte('-') + if isL0 { + if compactionFiles[f.L0Index] { + middleChar = '+' + } else if f.IsCompacting() { + if f.IsIntraL0Compacting { + middleChar = '^' + } else { + middleChar = 'v' + } + } + } else if f.IsCompacting() { + middleChar = 
'=' + } + if largestChar < f.Largest.UserKey[0] { + largestChar = f.Largest.UserKey[0] + } + if f.Smallest.UserKey[0] == f.Largest.UserKey[0] { + buf.WriteByte(f.Largest.UserKey[0]) + if compactionFiles[f.L0Index] { + buf.WriteByte('+') + } else if j < len(files)-1 { + buf.WriteByte(' ') + } + lastChar++ + continue + } + buf.WriteByte(middleChar) + buf.WriteByte(middleChar) + lastChar++ + for lastChar < f.Largest.UserKey[0] { + buf.WriteByte(middleChar) + buf.WriteByte(middleChar) + buf.WriteByte(middleChar) + lastChar++ + } + if f.Largest.IsExclusiveSentinel() && + j < len(files)-1 && files[j+1].Smallest.UserKey[0] == f.Largest.UserKey[0] { + // This case happens where two successive files have + // matching end/start user keys but where the left-side file + // has the sentinel key as its end key trailer. In this case + // we print the sstables as: + // + // a------d------g + // + continue + } + buf.WriteByte(middleChar) + buf.WriteByte(f.Largest.UserKey[0]) + if j < len(files)-1 { + buf.WriteByte(' ') + } + lastChar++ + } + fmt.Fprintf(&buf, "\n") + } + for i := len(s.levelFiles) - 1; i >= 0; i-- { + printLevel(s.levelFiles[i], fmt.Sprintf("0.%d", i), true) + } + for i := range otherLevels { + if len(otherLevels[i]) == 0 { + continue + } + printLevel(otherLevels[i], strconv.Itoa(i+1), false) + } + buf.WriteString(" ") + for b := byte('a'); b <= largestChar; b++ { + buf.WriteByte(b) + buf.WriteByte(b) + if b < largestChar { + buf.WriteByte(' ') + } + } + buf.WriteByte('\n') + return buf.String() +} + +func TestL0Sublevels(t *testing.T) { + parseMeta := func(s string) (*FileMetadata, error) { + parts := strings.Split(s, ":") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %s", s) + } + fileNum, err := strconv.Atoi(strings.TrimSpace(parts[0])) + if err != nil { + return nil, err + } + fields := strings.Fields(parts[1]) + keyRange := strings.Split(strings.TrimSpace(fields[0]), "-") + m := (&FileMetadata{}).ExtendPointKeyBounds( + 
base.DefaultComparer.Compare, + base.ParseInternalKey(strings.TrimSpace(keyRange[0])), + base.ParseInternalKey(strings.TrimSpace(keyRange[1])), + ) + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + if m.Largest.IsExclusiveSentinel() { + m.LargestSeqNum = m.SmallestSeqNum + } + m.FileNum = base.FileNum(fileNum) + m.Size = uint64(256) + m.InitPhysicalBacking() + if len(fields) > 1 { + for _, field := range fields[1:] { + parts := strings.Split(field, "=") + switch parts[0] { + case "base_compacting": + m.IsIntraL0Compacting = false + m.CompactionState = CompactionStateCompacting + case "intra_l0_compacting": + m.IsIntraL0Compacting = true + m.CompactionState = CompactionStateCompacting + case "compacting": + m.CompactionState = CompactionStateCompacting + case "size": + sizeInt, err := strconv.Atoi(parts[1]) + if err != nil { + return nil, err + } + m.Size = uint64(sizeInt) + } + } + } + + return m, nil + } + + var err error + var fileMetas [NumLevels][]*FileMetadata + var explicitSublevels [][]*FileMetadata + var activeCompactions []L0Compaction + var sublevels *L0Sublevels + baseLevel := NumLevels - 1 + + datadriven.RunTest(t, "testdata/l0_sublevels", func(t *testing.T, td *datadriven.TestData) string { + pickBaseCompaction := false + level := 0 + addL0FilesOpt := false + switch td.Cmd { + case "add-l0-files": + addL0FilesOpt = true + level = 0 + fallthrough + case "define": + if !addL0FilesOpt { + fileMetas = [NumLevels][]*FileMetadata{} + baseLevel = NumLevels - 1 + activeCompactions = nil + } + explicitSublevels = [][]*FileMetadata{} + sublevel := -1 + addedL0Files := make([]*FileMetadata, 0) + for _, data := range strings.Split(td.Input, "\n") { + data = strings.TrimSpace(data) + switch data[:2] { + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + level, err = strconv.Atoi(data[1:2]) + if err != nil { + return err.Error() + } + if level == 0 && len(data) > 3 { + // Sublevel was specified. 
+ sublevel, err = strconv.Atoi(data[3:]) + if err != nil { + return err.Error() + } + } else { + sublevel = -1 + } + default: + meta, err := parseMeta(data) + if err != nil { + return err.Error() + } + if level != 0 && level < baseLevel { + baseLevel = level + } + fileMetas[level] = append(fileMetas[level], meta) + if level == 0 { + addedL0Files = append(addedL0Files, meta) + } + if sublevel != -1 { + for len(explicitSublevels) <= sublevel { + explicitSublevels = append(explicitSublevels, []*FileMetadata{}) + } + explicitSublevels[sublevel] = append(explicitSublevels[sublevel], meta) + } + } + } + + flushSplitMaxBytes := 64 + initialize := true + for _, arg := range td.CmdArgs { + switch arg.Key { + case "flush_split_max_bytes": + flushSplitMaxBytes, err = strconv.Atoi(arg.Vals[0]) + if err != nil { + t.Fatal(err) + } + case "no_initialize": + // This case is for use with explicitly-specified sublevels + // only. + initialize = false + } + } + SortBySeqNum(fileMetas[0]) + for i := 1; i < NumLevels; i++ { + SortBySmallest(fileMetas[i], base.DefaultComparer.Compare) + } + + levelMetadata := makeLevelMetadata(base.DefaultComparer.Compare, 0, fileMetas[0]) + if initialize { + if addL0FilesOpt { + SortBySeqNum(addedL0Files) + sublevels, err = sublevels.AddL0Files(addedL0Files, int64(flushSplitMaxBytes), &levelMetadata) + // Check if the output matches a full initialization. 
+ sublevels2, _ := NewL0Sublevels(&levelMetadata, base.DefaultComparer.Compare, base.DefaultFormatter, int64(flushSplitMaxBytes)) + if sublevels != nil && sublevels2 != nil { + require.Equal(t, sublevels.flushSplitUserKeys, sublevels2.flushSplitUserKeys) + require.Equal(t, sublevels.levelFiles, sublevels2.levelFiles) + } + } else { + sublevels, err = NewL0Sublevels( + &levelMetadata, + base.DefaultComparer.Compare, + base.DefaultFormatter, + int64(flushSplitMaxBytes)) + } + if err != nil { + return err.Error() + } + sublevels.InitCompactingFileInfo(nil) + } else { + // This case is for use with explicitly-specified sublevels + // only. + sublevels = &L0Sublevels{ + levelFiles: explicitSublevels, + cmp: base.DefaultComparer.Compare, + formatKey: base.DefaultFormatter, + levelMetadata: &levelMetadata, + } + for _, files := range explicitSublevels { + sublevels.Levels = append(sublevels.Levels, NewLevelSliceSpecificOrder(files)) + } + } + + if err != nil { + t.Fatal(err) + } + + var builder strings.Builder + builder.WriteString(sublevels.describe(true)) + builder.WriteString(visualizeSublevels(sublevels, nil, fileMetas[1:])) + return builder.String() + case "pick-base-compaction": + pickBaseCompaction = true + fallthrough + case "pick-intra-l0-compaction": + minCompactionDepth := 3 + earliestUnflushedSeqNum := uint64(math.MaxUint64) + for _, arg := range td.CmdArgs { + switch arg.Key { + case "min_depth": + minCompactionDepth, err = strconv.Atoi(arg.Vals[0]) + if err != nil { + t.Fatal(err) + } + case "earliest_unflushed_seqnum": + eusnInt, err := strconv.Atoi(arg.Vals[0]) + if err != nil { + t.Fatal(err) + } + earliestUnflushedSeqNum = uint64(eusnInt) + } + } + + var lcf *L0CompactionFiles + if pickBaseCompaction { + baseFiles := NewLevelSliceKeySorted(base.DefaultComparer.Compare, fileMetas[baseLevel]) + lcf, err = sublevels.PickBaseCompaction(minCompactionDepth, baseFiles) + if err == nil && lcf != nil { + // Try to extend the base compaction into a more 
rectangular + // shape, using the smallest/largest keys of the files before + // and after overlapping base files. This mimics the logic + // the compactor is expected to implement. + baseFiles := fileMetas[baseLevel] + firstFile := sort.Search(len(baseFiles), func(i int) bool { + return sublevels.cmp(baseFiles[i].Largest.UserKey, sublevels.orderedIntervals[lcf.minIntervalIndex].startKey.key) >= 0 + }) + lastFile := sort.Search(len(baseFiles), func(i int) bool { + return sublevels.cmp(baseFiles[i].Smallest.UserKey, sublevels.orderedIntervals[lcf.maxIntervalIndex+1].startKey.key) >= 0 + }) + startKey := base.InvalidInternalKey + endKey := base.InvalidInternalKey + if firstFile > 0 { + startKey = baseFiles[firstFile-1].Largest + } + if lastFile < len(baseFiles) { + endKey = baseFiles[lastFile].Smallest + } + sublevels.ExtendL0ForBaseCompactionTo( + startKey, + endKey, + lcf) + } + } else { + lcf, err = sublevels.PickIntraL0Compaction(earliestUnflushedSeqNum, minCompactionDepth) + } + if err != nil { + return fmt.Sprintf("error: %s", err.Error()) + } + if lcf == nil { + return "no compaction picked" + } + var builder strings.Builder + builder.WriteString(fmt.Sprintf("compaction picked with stack depth reduction %d\n", lcf.seedIntervalStackDepthReduction)) + for i, file := range lcf.Files { + builder.WriteString(file.FileNum.String()) + if i < len(lcf.Files)-1 { + builder.WriteByte(',') + } + } + startKey := sublevels.orderedIntervals[lcf.seedInterval].startKey + endKey := sublevels.orderedIntervals[lcf.seedInterval+1].startKey + builder.WriteString(fmt.Sprintf("\nseed interval: %s-%s\n", startKey.key, endKey.key)) + builder.WriteString(visualizeSublevels(sublevels, lcf.FilesIncluded, fileMetas[1:])) + + return builder.String() + case "read-amp": + return strconv.Itoa(sublevels.ReadAmplification()) + case "in-use-key-ranges": + var buf bytes.Buffer + for _, data := range strings.Split(strings.TrimSpace(td.Input), "\n") { + keyRange := 
strings.Split(strings.TrimSpace(data), "-") + smallest := []byte(strings.TrimSpace(keyRange[0])) + largest := []byte(strings.TrimSpace(keyRange[1])) + + keyRanges := sublevels.InUseKeyRanges(smallest, largest) + for i, r := range keyRanges { + fmt.Fprintf(&buf, "%s-%s", sublevels.formatKey(r.Start), sublevels.formatKey(r.End)) + if i < len(keyRanges)-1 { + fmt.Fprint(&buf, ", ") + } + } + if len(keyRanges) == 0 { + fmt.Fprint(&buf, ".") + } + fmt.Fprintln(&buf) + } + return buf.String() + case "flush-split-keys": + var builder strings.Builder + builder.WriteString("flush user split keys: ") + flushSplitKeys := sublevels.FlushSplitKeys() + for i, key := range flushSplitKeys { + builder.Write(key) + if i < len(flushSplitKeys)-1 { + builder.WriteString(", ") + } + } + if len(flushSplitKeys) == 0 { + builder.WriteString("none") + } + return builder.String() + case "max-depth-after-ongoing-compactions": + return strconv.Itoa(sublevels.MaxDepthAfterOngoingCompactions()) + case "l0-check-ordering": + for sublevel, files := range sublevels.levelFiles { + slice := NewLevelSliceSpecificOrder(files) + err := CheckOrdering(base.DefaultComparer.Compare, base.DefaultFormatter, + L0Sublevel(sublevel), slice.Iter(), ProhibitSplitUserKeys) + if err != nil { + return err.Error() + } + } + return "OK" + case "update-state-for-compaction": + var fileNums []base.FileNum + for _, arg := range td.CmdArgs { + switch arg.Key { + case "files": + for _, val := range arg.Vals { + fileNum, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return err.Error() + } + fileNums = append(fileNums, base.FileNum(fileNum)) + } + } + } + files := make([]*FileMetadata, 0, len(fileNums)) + for _, num := range fileNums { + for _, f := range fileMetas[0] { + if f.FileNum == num { + f.CompactionState = CompactionStateCompacting + files = append(files, f) + break + } + } + } + slice := NewLevelSliceSeqSorted(files) + sm, la := KeyRange(base.DefaultComparer.Compare, slice.Iter()) + activeCompactions = 
append(activeCompactions, L0Compaction{Smallest: sm, Largest: la}) + if err := sublevels.UpdateStateForStartedCompaction([]LevelSlice{slice}, true); err != nil { + return err.Error() + } + return "OK" + case "describe": + var builder strings.Builder + builder.WriteString(sublevels.describe(true)) + builder.WriteString(visualizeSublevels(sublevels, nil, fileMetas[1:])) + return builder.String() + } + return fmt.Sprintf("unrecognized command: %s", td.Cmd) + }) +} + +func TestAddL0FilesEquivalence(t *testing.T) { + seed := uint64(time.Now().UnixNano()) + rng := rand.New(rand.NewSource(seed)) + t.Logf("seed: %d", seed) + + var inUseKeys [][]byte + const keyReusePct = 0.15 + var fileMetas []*FileMetadata + var s, s2 *L0Sublevels + keySpace := testkeys.Alpha(8) + + flushSplitMaxBytes := rng.Int63n(1 << 20) + + // The outer loop runs once for each version edit. The inner loop(s) run + // once for each file, or each file bound. + for i := 0; i < 100; i++ { + var filesToAdd []*FileMetadata + numFiles := 1 + rng.Intn(9) + keys := make([][]byte, 0, 2*numFiles) + for j := 0; j < 2*numFiles; j++ { + if rng.Float64() <= keyReusePct && len(inUseKeys) > 0 { + keys = append(keys, inUseKeys[rng.Intn(len(inUseKeys))]) + } else { + newKey := testkeys.Key(keySpace, rng.Int63n(keySpace.Count())) + inUseKeys = append(inUseKeys, newKey) + keys = append(keys, newKey) + } + } + slices.SortFunc(keys, bytes.Compare) + for j := 0; j < numFiles; j++ { + startKey := keys[j*2] + endKey := keys[j*2+1] + if bytes.Equal(startKey, endKey) { + continue + } + meta := (&FileMetadata{ + FileNum: base.FileNum(i*10 + j + 1), + Size: rng.Uint64n(1 << 20), + SmallestSeqNum: uint64(2*i + 1), + LargestSeqNum: uint64(2*i + 2), + }).ExtendPointKeyBounds( + base.DefaultComparer.Compare, + base.MakeInternalKey(startKey, uint64(2*i+1), base.InternalKeyKindSet), + base.MakeRangeDeleteSentinelKey(endKey), + ) + meta.InitPhysicalBacking() + fileMetas = append(fileMetas, meta) + filesToAdd = append(filesToAdd, meta) + 
} + if len(filesToAdd) == 0 { + continue + } + + levelMetadata := makeLevelMetadata(testkeys.Comparer.Compare, 0, fileMetas) + var err error + + if s2 == nil { + s2, err = NewL0Sublevels(&levelMetadata, testkeys.Comparer.Compare, testkeys.Comparer.FormatKey, flushSplitMaxBytes) + require.NoError(t, err) + } else { + // AddL0Files relies on the indices in FileMetadatas pointing to that of + // the previous L0Sublevels. So it must be called before NewL0Sublevels; + // calling it the other way around results in out-of-bounds panics. + SortBySeqNum(filesToAdd) + s2, err = s2.AddL0Files(filesToAdd, flushSplitMaxBytes, &levelMetadata) + require.NoError(t, err) + } + + s, err = NewL0Sublevels(&levelMetadata, testkeys.Comparer.Compare, testkeys.Comparer.FormatKey, flushSplitMaxBytes) + require.NoError(t, err) + + // Check for equivalence. + require.Equal(t, s.flushSplitUserKeys, s2.flushSplitUserKeys) + require.Equal(t, s.orderedIntervals, s2.orderedIntervals) + require.Equal(t, s.levelFiles, s2.levelFiles) + } +} + +func BenchmarkManifestApplyWithL0Sublevels(b *testing.B) { + b.ResetTimer() + for n := 0; n < b.N; n++ { + v, err := readManifest("testdata/MANIFEST_import") + require.NotNil(b, v) + require.NoError(b, err) + } +} + +func BenchmarkL0SublevelsInit(b *testing.B) { + v, err := readManifest("testdata/MANIFEST_import") + if err != nil { + b.Fatal(err) + } + b.ResetTimer() + for n := 0; n < b.N; n++ { + sl, err := NewL0Sublevels(&v.Levels[0], + base.DefaultComparer.Compare, base.DefaultFormatter, 5<<20) + require.NoError(b, err) + if sl == nil { + b.Fatal("expected non-nil L0Sublevels to be generated") + } + } +} + +func BenchmarkL0SublevelsInitAndPick(b *testing.B) { + v, err := readManifest("testdata/MANIFEST_import") + if err != nil { + b.Fatal(err) + } + b.ResetTimer() + for n := 0; n < b.N; n++ { + sl, err := NewL0Sublevels(&v.Levels[0], + base.DefaultComparer.Compare, base.DefaultFormatter, 5<<20) + require.NoError(b, err) + if sl == nil { + b.Fatal("expected 
non-nil L0Sublevels to be generated") + } + c, err := sl.PickBaseCompaction(2, LevelSlice{}) + require.NoError(b, err) + if c == nil { + b.Fatal("expected non-nil compaction to be generated") + } + } +} diff --git a/pebble/internal/manifest/level.go b/pebble/internal/manifest/level.go new file mode 100644 index 0000000..1a971f6 --- /dev/null +++ b/pebble/internal/manifest/level.go @@ -0,0 +1,46 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import "fmt" + +const ( + // 3 bits are necessary to represent level values from 0-6. + levelBits = 3 + levelMask = (1 << levelBits) - 1 + // invalidSublevel denotes an invalid or non-applicable sublevel. + invalidSublevel = -1 +) + +// Level encodes a level and optional sublevel for use in log and error +// messages. The encoding has the property that Level(0) == +// L0Sublevel(invalidSublevel). +type Level uint32 + +func makeLevel(level, sublevel int) Level { + return Level(((sublevel + 1) << levelBits) | level) +} + +// LevelToInt returns the int representation of a Level +func LevelToInt(l Level) int { + return int(l) & levelMask +} + +// L0Sublevel returns a Level representing the specified L0 sublevel. +func L0Sublevel(sublevel int) Level { + if sublevel < 0 { + panic(fmt.Sprintf("invalid L0 sublevel: %d", sublevel)) + } + return makeLevel(0, sublevel) +} + +func (l Level) String() string { + level := int(l) & levelMask + sublevel := (int(l) >> levelBits) - 1 + if sublevel != invalidSublevel { + return fmt.Sprintf("L%d.%d", level, sublevel) + } + return fmt.Sprintf("L%d", level) +} diff --git a/pebble/internal/manifest/level_metadata.go b/pebble/internal/manifest/level_metadata.go new file mode 100644 index 0000000..d48e277 --- /dev/null +++ b/pebble/internal/manifest/level_metadata.go @@ -0,0 +1,748 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. 
All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + "fmt" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" +) + +// LevelMetadata contains metadata for all of the files within +// a level of the LSM. +type LevelMetadata struct { + level int + totalSize uint64 + // NumVirtual is the number of virtual sstables in the level. + NumVirtual uint64 + // VirtualSize is the size of the virtual sstables in the level. + VirtualSize uint64 + tree btree +} + +// clone makes a copy of the level metadata, implicitly increasing the ref +// count of every file contained within lm. +func (lm *LevelMetadata) clone() LevelMetadata { + return LevelMetadata{ + level: lm.level, + totalSize: lm.totalSize, + NumVirtual: lm.NumVirtual, + VirtualSize: lm.VirtualSize, + tree: lm.tree.Clone(), + } +} + +func (lm *LevelMetadata) release() (obsolete []*FileBacking) { + return lm.tree.Release() +} + +func makeLevelMetadata(cmp Compare, level int, files []*FileMetadata) LevelMetadata { + bcmp := btreeCmpSeqNum + if level > 0 { + bcmp = btreeCmpSmallestKey(cmp) + } + var lm LevelMetadata + lm.level = level + lm.tree, _ = makeBTree(bcmp, files) + for _, f := range files { + lm.totalSize += f.Size + if f.Virtual { + lm.NumVirtual++ + lm.VirtualSize += f.Size + } + } + return lm +} + +func makeBTree(cmp btreeCmp, files []*FileMetadata) (btree, LevelSlice) { + var t btree + t.cmp = cmp + for _, f := range files { + t.Insert(f) + } + return t, newLevelSlice(t.Iter()) +} + +func (lm *LevelMetadata) insert(f *FileMetadata) error { + if err := lm.tree.Insert(f); err != nil { + return err + } + lm.totalSize += f.Size + if f.Virtual { + lm.NumVirtual++ + lm.VirtualSize += f.Size + } + return nil +} + +func (lm *LevelMetadata) remove(f *FileMetadata) bool { + lm.totalSize -= f.Size + if f.Virtual { + lm.NumVirtual-- + lm.VirtualSize -= 
f.Size + } + return lm.tree.Delete(f) +} + +// Empty indicates whether there are any files in the level. +func (lm *LevelMetadata) Empty() bool { + return lm.tree.Count() == 0 +} + +// Len returns the number of files within the level. +func (lm *LevelMetadata) Len() int { + return lm.tree.Count() +} + +// Size returns the cumulative size of all the files within the level. +func (lm *LevelMetadata) Size() uint64 { + return lm.totalSize +} + +// Iter constructs a LevelIterator over the entire level. +func (lm *LevelMetadata) Iter() LevelIterator { + return LevelIterator{iter: lm.tree.Iter()} +} + +// Slice constructs a slice containing the entire level. +func (lm *LevelMetadata) Slice() LevelSlice { + return newLevelSlice(lm.tree.Iter()) +} + +// Find finds the provided file in the level if it exists. +func (lm *LevelMetadata) Find(cmp base.Compare, m *FileMetadata) *LevelFile { + iter := lm.Iter() + if lm.level != 0 { + // If lm holds files for levels >0, we can narrow our search by binary + // searching by bounds. + o := overlaps(iter, cmp, m.Smallest.UserKey, + m.Largest.UserKey, m.Largest.IsExclusiveSentinel()) + iter = o.Iter() + } + for f := iter.First(); f != nil; f = iter.Next() { + if f == m { + lf := iter.Take() + return &lf + } + } + return nil +} + +// Annotation lazily calculates and returns the annotation defined by +// Annotator. The Annotator is used as the key for pre-calculated +// values, so equal Annotators must be used to avoid duplicate computations +// and cached annotations. Annotation must not be called concurrently, and in +// practice this is achieved by requiring callers to hold DB.mu. +func (lm *LevelMetadata) Annotation(annotator Annotator) interface{} { + if lm.Empty() { + return annotator.Zero(nil) + } + v, _ := lm.tree.root.Annotation(annotator) + return v +} + +// InvalidateAnnotation clears any cached annotations defined by Annotator. 
The +// Annotator is used as the key for pre-calculated values, so equal Annotators +// must be used to clear the appropriate cached annotation. InvalidateAnnotation +// must not be called concurrently, and in practice this is achieved by +// requiring callers to hold DB.mu. +func (lm *LevelMetadata) InvalidateAnnotation(annotator Annotator) { + if lm.Empty() { + return + } + lm.tree.root.InvalidateAnnotation(annotator) +} + +// LevelFile holds a file's metadata along with its position +// within a level of the LSM. +type LevelFile struct { + *FileMetadata + slice LevelSlice +} + +// Slice constructs a LevelSlice containing only this file. +func (lf LevelFile) Slice() LevelSlice { + return lf.slice +} + +// NewLevelSliceSeqSorted constructs a LevelSlice over the provided files, +// sorted by the L0 sequence number sort order. +// TODO(jackson): Can we improve this interface or avoid needing to export +// a slice constructor like this? +func NewLevelSliceSeqSorted(files []*FileMetadata) LevelSlice { + tr, slice := makeBTree(btreeCmpSeqNum, files) + tr.Release() + slice.verifyInvariants() + return slice +} + +// NewLevelSliceKeySorted constructs a LevelSlice over the provided files, +// sorted by the files smallest keys. +// TODO(jackson): Can we improve this interface or avoid needing to export +// a slice constructor like this? +func NewLevelSliceKeySorted(cmp base.Compare, files []*FileMetadata) LevelSlice { + tr, slice := makeBTree(btreeCmpSmallestKey(cmp), files) + tr.Release() + slice.verifyInvariants() + return slice +} + +// NewLevelSliceSpecificOrder constructs a LevelSlice over the provided files, +// ordering the files by their order in the provided slice. It's used in +// tests. +// TODO(jackson): Update tests to avoid requiring this and remove it. 
+func NewLevelSliceSpecificOrder(files []*FileMetadata) LevelSlice { + tr, slice := makeBTree(btreeCmpSpecificOrder(files), files) + tr.Release() + slice.verifyInvariants() + return slice +} + +// newLevelSlice constructs a new LevelSlice backed by iter. +func newLevelSlice(iter iterator) LevelSlice { + s := LevelSlice{iter: iter} + if iter.r != nil { + s.length = iter.r.subtreeCount + } + s.verifyInvariants() + return s +} + +// newBoundedLevelSlice constructs a new LevelSlice backed by iter and bounded +// by the provided start and end bounds. The provided startBound and endBound +// iterators must be iterators over the same B-Tree. Both start and end bounds +// are inclusive. +func newBoundedLevelSlice(iter iterator, startBound, endBound *iterator) LevelSlice { + s := LevelSlice{ + iter: iter, + start: startBound, + end: endBound, + } + if iter.valid() { + s.length = endBound.countLeft() - startBound.countLeft() + // NB: The +1 is a consequence of the end bound being inclusive. + if endBound.valid() { + s.length++ + } + // NB: A slice that's empty due to its bounds may have an endBound + // positioned before the startBound due to the inclusive bounds. + // TODO(jackson): Consider refactoring the end boundary to be exclusive; + // it would simplify some areas (eg, here) and complicate others (eg, + // Reslice-ing to grow compactions). + if s.length < 0 { + s.length = 0 + } + } + s.verifyInvariants() + return s +} + +// LevelSlice contains a slice of the files within a level of the LSM. +// A LevelSlice is immutable once created, but may be used to construct a +// mutable LevelIterator over the slice's files. +// +// LevelSlices should be constructed through one of the existing constructors, +// not manually initialized. +type LevelSlice struct { + iter iterator + length int + // start and end form the inclusive bounds of a slice of files within a + // level of the LSM. They may be nil if the entire B-Tree backing iter is + // accessible. 
+ start *iterator + end *iterator +} + +func (ls LevelSlice) verifyInvariants() { + if invariants.Enabled { + i := ls.Iter() + var length int + for f := i.First(); f != nil; f = i.Next() { + length++ + } + if ls.length != length { + panic(fmt.Sprintf("LevelSlice %s has length %d value; actual length is %d", ls, ls.length, length)) + } + } +} + +// Each invokes fn for each element in the slice. +func (ls LevelSlice) Each(fn func(*FileMetadata)) { + iter := ls.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + fn(f) + } +} + +// String implements fmt.Stringer. +func (ls LevelSlice) String() string { + var buf bytes.Buffer + fmt.Fprintf(&buf, "%d files: ", ls.length) + ls.Each(func(f *FileMetadata) { + if buf.Len() > 0 { + fmt.Fprintf(&buf, " ") + } + fmt.Fprint(&buf, f) + }) + return buf.String() +} + +// Empty indicates whether the slice contains any files. +func (ls *LevelSlice) Empty() bool { + return emptyWithBounds(ls.iter, ls.start, ls.end) +} + +// Iter constructs a LevelIterator that iterates over the slice. +func (ls *LevelSlice) Iter() LevelIterator { + return LevelIterator{ + start: ls.start, + end: ls.end, + iter: ls.iter.clone(), + } +} + +// Len returns the number of files in the slice. Its runtime is constant. +func (ls *LevelSlice) Len() int { + return ls.length +} + +// SizeSum sums the size of all files in the slice. Its runtime is linear in +// the length of the slice. +func (ls *LevelSlice) SizeSum() uint64 { + var sum uint64 + iter := ls.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + sum += f.Size + } + return sum +} + +// NumVirtual returns the number of virtual sstables in the level. Its runtime is +// linear in the length of the slice. +func (ls *LevelSlice) NumVirtual() uint64 { + var n uint64 + iter := ls.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.Virtual { + n++ + } + } + return n +} + +// VirtualSizeSum returns the sum of the sizes of the virtual sstables in the +// level. 
+func (ls *LevelSlice) VirtualSizeSum() uint64 { + var sum uint64 + iter := ls.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.Virtual { + sum += f.Size + } + } + return sum +} + +// Reslice constructs a new slice backed by the same underlying level, with +// new start and end positions. Reslice invokes the provided function, passing +// two LevelIterators: one positioned to i's inclusive start and one +// positioned to i's inclusive end. The resliceFunc may move either iterator +// forward or backwards, including beyond the callee's original bounds to +// capture additional files from the underlying level. Reslice constructs and +// returns a new LevelSlice with the final bounds of the iterators after +// calling resliceFunc. +func (ls LevelSlice) Reslice(resliceFunc func(start, end *LevelIterator)) LevelSlice { + if ls.iter.r == nil { + return ls + } + var start, end LevelIterator + if ls.start == nil { + start.iter = ls.iter.clone() + start.iter.first() + } else { + start.iter = ls.start.clone() + } + if ls.end == nil { + end.iter = ls.iter.clone() + end.iter.last() + } else { + end.iter = ls.end.clone() + } + resliceFunc(&start, &end) + return newBoundedLevelSlice(start.iter.clone(), &start.iter, &end.iter) +} + +// KeyType is used to specify the type of keys we're looking for in +// LevelIterator positioning operations. Files not containing any keys of the +// desired type are skipped. +type KeyType int8 + +const ( + // KeyTypePointAndRange denotes a search among the entire keyspace, including + // both point keys and range keys. No sstables are skipped. + KeyTypePointAndRange KeyType = iota + // KeyTypePoint denotes a search among the point keyspace. SSTables with no + // point keys will be skipped. Note that the point keyspace includes rangedels. + KeyTypePoint + // KeyTypeRange denotes a search among the range keyspace. SSTables with no + // range keys will be skipped. 
+ KeyTypeRange +) + +type keyTypeAnnotator struct{} + +var _ Annotator = keyTypeAnnotator{} + +func (k keyTypeAnnotator) Zero(dst interface{}) interface{} { + var val *KeyType + if dst != nil { + val = dst.(*KeyType) + } else { + val = new(KeyType) + } + *val = KeyTypePoint + return val +} + +func (k keyTypeAnnotator) Accumulate(m *FileMetadata, dst interface{}) (interface{}, bool) { + v := dst.(*KeyType) + switch *v { + case KeyTypePoint: + if m.HasRangeKeys { + *v = KeyTypePointAndRange + } + case KeyTypePointAndRange: + // Do nothing. + default: + panic("unexpected key type") + } + return v, true +} + +func (k keyTypeAnnotator) Merge(src interface{}, dst interface{}) interface{} { + v := dst.(*KeyType) + srcVal := src.(*KeyType) + switch *v { + case KeyTypePoint: + if *srcVal == KeyTypePointAndRange { + *v = KeyTypePointAndRange + } + case KeyTypePointAndRange: + // Do nothing. + default: + panic("unexpected key type") + } + return v +} + +// LevelIterator iterates over a set of files' metadata. Its zero value is an +// empty iterator. 
+type LevelIterator struct { + iter iterator + start *iterator + end *iterator + filter KeyType +} + +func (i LevelIterator) String() string { + var buf bytes.Buffer + iter := i.iter.clone() + iter.first() + iter.prev() + if i.iter.pos == -1 { + fmt.Fprint(&buf, "()*") + } + iter.next() + for ; iter.valid(); iter.next() { + if buf.Len() > 0 { + fmt.Fprint(&buf, " ") + } + + if i.start != nil && cmpIter(iter, *i.start) == 0 { + fmt.Fprintf(&buf, " [ ") + } + isCurrentPos := cmpIter(iter, i.iter) == 0 + if isCurrentPos { + fmt.Fprint(&buf, " ( ") + } + fmt.Fprint(&buf, iter.cur().String()) + if isCurrentPos { + fmt.Fprint(&buf, " )*") + } + if i.end != nil && cmpIter(iter, *i.end) == 0 { + fmt.Fprintf(&buf, " ]") + } + } + if i.iter.n != nil && i.iter.pos >= i.iter.n.count { + if buf.Len() > 0 { + fmt.Fprint(&buf, " ") + } + fmt.Fprint(&buf, "()*") + } + return buf.String() +} + +// Clone copies the iterator, returning an independent iterator at the same +// position. +func (i *LevelIterator) Clone() LevelIterator { + if i.iter.r == nil { + return *i + } + // The start and end iterators are not cloned and are treated as + // immutable. + return LevelIterator{ + iter: i.iter.clone(), + start: i.start, + end: i.end, + filter: i.filter, + } +} + +// Current returns the item at the current iterator position. +// +// Current is deprecated. Callers should instead use the return value of a +// positioning operation. +func (i *LevelIterator) Current() *FileMetadata { + if !i.iter.valid() || + (i.end != nil && cmpIter(i.iter, *i.end) > 0) || + (i.start != nil && cmpIter(i.iter, *i.start) < 0) { + return nil + } + return i.iter.cur() +} + +func (i *LevelIterator) empty() bool { + return emptyWithBounds(i.iter, i.start, i.end) +} + +// Filter clones the iterator and sets the desired KeyType as the key to filter +// files on. 
+func (i *LevelIterator) Filter(keyType KeyType) LevelIterator { + l := i.Clone() + l.filter = keyType + return l +} + +func emptyWithBounds(i iterator, start, end *iterator) bool { + // If i.r is nil, the iterator was constructed from an empty btree. + // If the end bound is before the start bound, the bounds represent an + // empty slice of the B-Tree. + return i.r == nil || (start != nil && end != nil && cmpIter(*end, *start) < 0) +} + +// First seeks to the first file in the iterator and returns it. +func (i *LevelIterator) First() *FileMetadata { + if i.empty() { + return nil + } + if i.start != nil { + i.iter = i.start.clone() + } else { + i.iter.first() + } + if !i.iter.valid() { + return nil + } + return i.skipFilteredForward(i.iter.cur()) +} + +// Last seeks to the last file in the iterator and returns it. +func (i *LevelIterator) Last() *FileMetadata { + if i.empty() { + return nil + } + if i.end != nil { + i.iter = i.end.clone() + } else { + i.iter.last() + } + if !i.iter.valid() { + return nil + } + return i.skipFilteredBackward(i.iter.cur()) +} + +// Next advances the iterator to the next file and returns it. +func (i *LevelIterator) Next() *FileMetadata { + if i.iter.r == nil { + return nil + } + if invariants.Enabled && (i.iter.pos >= i.iter.n.count || (i.end != nil && cmpIter(i.iter, *i.end) > 0)) { + panic("pebble: cannot next forward-exhausted iterator") + } + i.iter.next() + if !i.iter.valid() { + return nil + } + return i.skipFilteredForward(i.iter.cur()) +} + +// Prev moves the iterator the previous file and returns it. 
+func (i *LevelIterator) Prev() *FileMetadata { + if i.iter.r == nil { + return nil + } + if invariants.Enabled && (i.iter.pos < 0 || (i.start != nil && cmpIter(i.iter, *i.start) < 0)) { + panic("pebble: cannot prev backward-exhausted iterator") + } + i.iter.prev() + if !i.iter.valid() { + return nil + } + return i.skipFilteredBackward(i.iter.cur()) +} + +// SeekGE seeks to the first file in the iterator's file set with a largest +// user key greater than or equal to the provided user key. The iterator must +// have been constructed from L1+, because it requires the underlying files to +// be sorted by user keys and non-overlapping. +func (i *LevelIterator) SeekGE(cmp Compare, userKey []byte) *FileMetadata { + // TODO(jackson): Assert that i.iter.cmp == btreeCmpSmallestKey. + if i.iter.r == nil { + return nil + } + m := i.seek(func(m *FileMetadata) bool { + return cmp(m.Largest.UserKey, userKey) >= 0 + }) + if i.filter != KeyTypePointAndRange && m != nil { + b, ok := m.LargestBound(i.filter) + if !ok { + m = i.Next() + } else if c := cmp(b.UserKey, userKey); c < 0 || c == 0 && b.IsExclusiveSentinel() { + // This file does not contain any keys of the type ≥ lower. It + // should be filtered, even though it does contain point keys. + m = i.Next() + } + } + return i.skipFilteredForward(m) +} + +// SeekLT seeks to the last file in the iterator's file set with a smallest +// user key less than the provided user key. The iterator must have been +// constructed from L1+, because it requires the underlying files to be sorted +// by user keys and non-overlapping. +func (i *LevelIterator) SeekLT(cmp Compare, userKey []byte) *FileMetadata { + // TODO(jackson): Assert that i.iter.cmp == btreeCmpSmallestKey. 
+ if i.iter.r == nil { + return nil + } + i.seek(func(m *FileMetadata) bool { + return cmp(m.Smallest.UserKey, userKey) >= 0 + }) + m := i.Prev() + // Although i.Prev() guarantees that the current file contains keys of the + // relevant type, it doesn't guarantee that the keys of the relevant type + // are < userKey. + if i.filter != KeyTypePointAndRange && m != nil { + b, ok := m.SmallestBound(i.filter) + if !ok { + panic("unreachable") + } + if c := cmp(b.UserKey, userKey); c >= 0 { + // This file does not contain any keys of the type ≥ lower. It + // should be filtered, even though it does contain point keys. + m = i.Prev() + } + } + return i.skipFilteredBackward(m) +} + +// skipFilteredForward takes the file metadata at the iterator's current +// position, and skips forward if the current key-type filter (i.filter) +// excludes the file. It skips until it finds an unfiltered file or exhausts the +// level. If lower is != nil, skipFilteredForward skips any files that do not +// contain keys with the provided key-type ≥ lower. +// +// skipFilteredForward also enforces the upper bound, returning nil if at any +// point the upper bound is exceeded. +func (i *LevelIterator) skipFilteredForward(meta *FileMetadata) *FileMetadata { + for meta != nil && !meta.ContainsKeyType(i.filter) { + i.iter.next() + if !i.iter.valid() { + meta = nil + } else { + meta = i.iter.cur() + } + } + if meta != nil && i.end != nil && cmpIter(i.iter, *i.end) > 0 { + // Exceeded upper bound. + meta = nil + } + return meta +} + +// skipFilteredBackward takes the file metadata at the iterator's current +// position, and skips backward if the current key-type filter (i.filter) +// excludes the file. It skips until it finds an unfiltered file or exhausts the +// level. If upper is != nil, skipFilteredBackward skips any files that do not +// contain keys with the provided key-type < upper. 
+// +// skipFilteredBackward also enforces the lower bound, returning nil if at any +// point the lower bound is exceeded. +func (i *LevelIterator) skipFilteredBackward(meta *FileMetadata) *FileMetadata { + for meta != nil && !meta.ContainsKeyType(i.filter) { + i.iter.prev() + if !i.iter.valid() { + meta = nil + } else { + meta = i.iter.cur() + } + } + if meta != nil && i.start != nil && cmpIter(i.iter, *i.start) < 0 { + // Exceeded lower bound. + meta = nil + } + return meta +} + +func (i *LevelIterator) seek(fn func(*FileMetadata) bool) *FileMetadata { + i.iter.seek(fn) + + // i.iter.seek seeked in the unbounded underlying B-Tree. If the iterator + // has start or end bounds, we may have exceeded them. Reset to the bounds + // if necessary. + // + // NB: The LevelIterator and LevelSlice semantics require that a bounded + // LevelIterator/LevelSlice containing files x0, x1, ..., xn behave + // identically to an unbounded LevelIterator/LevelSlice of a B-Tree + // containing x0, x1, ..., xn. In other words, any files outside the + // LevelIterator's bounds should not influence the iterator's behavior. + // When seeking, this means a SeekGE that seeks beyond the end bound, + // followed by a Prev should return the last element within bounds. + if i.end != nil && cmpIter(i.iter, *i.end) > 0 { + i.iter = i.end.clone() + // Since seek(fn) positioned beyond i.end, we know there is nothing to + // return within bounds. + i.iter.next() + return nil + } else if i.start != nil && cmpIter(i.iter, *i.start) < 0 { + i.iter = i.start.clone() + } + if !i.iter.valid() { + return nil + } + return i.iter.cur() +} + +// Take constructs a LevelFile containing the file at the iterator's current +// position. Take panics if the iterator is not currently positioned over a +// file. 
+func (i *LevelIterator) Take() LevelFile { + m := i.Current() + if m == nil { + panic("Take called on invalid LevelIterator") + } + // LevelSlice's start and end fields are immutable and are positioned to + // the same position for a LevelFile because they're inclusive, so we can + // share one iterator stack between the two bounds. + boundsIter := i.iter.clone() + s := newBoundedLevelSlice(i.iter.clone(), &boundsIter, &boundsIter) + return LevelFile{ + FileMetadata: m, + slice: s, + } +} diff --git a/pebble/internal/manifest/level_metadata_test.go b/pebble/internal/manifest/level_metadata_test.go new file mode 100644 index 0000000..95ef91a --- /dev/null +++ b/pebble/internal/manifest/level_metadata_test.go @@ -0,0 +1,144 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + "fmt" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/stretchr/testify/require" +) + +func TestLevelIterator(t *testing.T) { + var level LevelSlice + datadriven.RunTest(t, "testdata/level_iterator", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + var files []*FileMetadata + var startReslice int + var endReslice int + for _, metaStr := range strings.Split(d.Input, " ") { + switch metaStr { + case "[": + startReslice = len(files) + continue + case "]": + endReslice = len(files) + continue + case " ", "": + continue + default: + parts := strings.Split(metaStr, "-") + if len(parts) != 2 { + t.Fatalf("malformed table spec: %q", metaStr) + } + m := &FileMetadata{FileNum: base.FileNum(len(files) + 1)} + m.ExtendPointKeyBounds( + base.DefaultComparer.Compare, + base.ParseInternalKey(strings.TrimSpace(parts[0])), + base.ParseInternalKey(strings.TrimSpace(parts[1])), + ) + m.SmallestSeqNum = m.Smallest.SeqNum() + 
m.LargestSeqNum = m.Largest.SeqNum() + m.InitPhysicalBacking() + files = append(files, m) + } + } + level = NewLevelSliceKeySorted(base.DefaultComparer.Compare, files) + level = level.Reslice(func(start, end *LevelIterator) { + for i := 0; i < startReslice; i++ { + start.Next() + } + for i := len(files); i > endReslice; i-- { + end.Prev() + } + }) + return "" + + case "iter": + return runIterCmd(t, d, level.Iter(), false /* verbose */) + + default: + return fmt.Sprintf("unknown command %q", d.Cmd) + } + }) +} + +func TestLevelIteratorFiltered(t *testing.T) { + var level LevelSlice + datadriven.RunTest(t, "testdata/level_iterator_filtered", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + var files []*FileMetadata + for _, metaStr := range strings.Split(d.Input, "\n") { + m, err := ParseFileMetadataDebug(metaStr) + require.NoError(t, err) + files = append(files, m) + } + level = NewLevelSliceKeySorted(base.DefaultComparer.Compare, files) + return "" + + case "iter": + var keyType string + d.ScanArgs(t, "key-type", &keyType) + iter := level.Iter() + switch keyType { + case "both": + // noop + case "points": + iter = iter.Filter(KeyTypePoint) + case "ranges": + iter = iter.Filter(KeyTypeRange) + } + return runIterCmd(t, d, iter, true /* verbose */) + + default: + return fmt.Sprintf("unknown command %q", d.Cmd) + } + }) +} + +func runIterCmd(t *testing.T, d *datadriven.TestData, iter LevelIterator, verbose bool) string { + var buf bytes.Buffer + for _, line := range strings.Split(d.Input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + var m *FileMetadata + switch parts[0] { + case "first": + m = iter.First() + case "last": + m = iter.Last() + case "next": + m = iter.Next() + case "prev": + m = iter.Prev() + case "seek-ge": + m = iter.SeekGE(base.DefaultComparer.Compare, []byte(parts[1])) + case "seek-lt": + m = iter.SeekLT(base.DefaultComparer.Compare, []byte(parts[1])) + default: + return 
fmt.Sprintf("unknown command %q", parts[0]) + } + if m == nil { + fmt.Fprintln(&buf, ".") + } else { + if verbose { + fmt.Fprintln(&buf, m.DebugString(base.DefaultComparer.FormatKey, verbose)) + } else { + fmt.Fprintln(&buf, m) + } + } + } + return buf.String() +} diff --git a/pebble/internal/manifest/level_test.go b/pebble/internal/manifest/level_test.go new file mode 100644 index 0000000..0b9aa7f --- /dev/null +++ b/pebble/internal/manifest/level_test.go @@ -0,0 +1,64 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestLevel(t *testing.T) { + testCases := []struct { + level int + expected string + }{ + {0, "L0"}, + {1, "L1"}, + {2, "L2"}, + {3, "L3"}, + {4, "L4"}, + {5, "L5"}, + {6, "L6"}, + {7, "L7"}, + } + + for _, c := range testCases { + t.Run("", func(t *testing.T) { + s := Level(c.level).String() + require.EqualValues(t, c.expected, s) + }) + } +} + +func TestL0Sublevel(t *testing.T) { + testCases := []struct { + level int + sublevel int + expected string + }{ + {0, 0, "L0.0"}, + {0, 1, "L0.1"}, + {0, 2, "L0.2"}, + {0, 1000, "L0.1000"}, + {0, -1, "invalid L0 sublevel: -1"}, + {0, -2, "invalid L0 sublevel: -2"}, + } + + for _, c := range testCases { + t.Run("", func(t *testing.T) { + s := func() (result string) { + defer func() { + if r := recover(); r != nil { + result = fmt.Sprint(r) + } + }() + return L0Sublevel(c.sublevel).String() + }() + require.EqualValues(t, c.expected, s) + }) + } +} diff --git a/pebble/internal/manifest/testdata/MANIFEST_import b/pebble/internal/manifest/testdata/MANIFEST_import new file mode 100644 index 0000000..3c5a010 Binary files /dev/null and b/pebble/internal/manifest/testdata/MANIFEST_import differ diff --git a/pebble/internal/manifest/testdata/file_metadata_bounds 
b/pebble/internal/manifest/testdata/file_metadata_bounds new file mode 100644 index 0000000..f849d44 --- /dev/null +++ b/pebble/internal/manifest/testdata/file_metadata_bounds @@ -0,0 +1,81 @@ +# Points only (single update). + +extend-point-key-bounds +a.SET.0 - z.DEL.42 +---- +000000:[a#0,SET-z#42,DEL] seqnums:[0-0] points:[a#0,SET-z#42,DEL] + bounds: (smallest=point,largest=point) (0x00000111) + +# Rangedels only (single update). + +reset +---- + +extend-point-key-bounds +a.RANGEDEL.0:z +---- +000000:[a#0,RANGEDEL-z#inf,RANGEDEL] seqnums:[0-0] points:[a#0,RANGEDEL-z#inf,RANGEDEL] + bounds: (smallest=point,largest=point) (0x00000111) + +# Range keys only (single update). + +reset +---- + +extend-range-key-bounds +a.RANGEKEYSET.0:z +---- +000000:[a#0,RANGEKEYSET-z#inf,RANGEKEYSET] seqnums:[0-0] ranges:[a#0,RANGEKEYSET-z#inf,RANGEKEYSET] + bounds: (smallest=range,largest=range) (0x00000000) + +# Multiple updates with various key kinds. + +reset +---- + +extend-point-key-bounds +m.SET.0 - n.SET.0 +---- +000000:[m#0,SET-n#0,SET] seqnums:[0-0] points:[m#0,SET-n#0,SET] + bounds: (smallest=point,largest=point) (0x00000111) + +# Extend the lower point key bound. + +extend-point-key-bounds +j.SET.0 - k.SET.0 +---- +000000:[j#0,SET-n#0,SET] seqnums:[0-0] points:[j#0,SET-n#0,SET] + bounds: (smallest=point,largest=point) (0x00000111) + +# Extend the upper point key bound with a rangedel. + +extend-point-key-bounds +k.RANGEDEL.0:o +---- +000000:[j#0,SET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] + bounds: (smallest=point,largest=point) (0x00000111) + +# Extend the lower bounds bound with a range key. 
+ +extend-range-key-bounds +a.RANGEKEYSET.42:m +---- +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] + bounds: (smallest=range,largest=point) (0x00000101) + +# Extend again with a wide range key (equal keys tiebreak on seqnums descending, +# so the overall lower bound is unchanged). + +extend-range-key-bounds +a.RANGEKEYSET.0:z +---- +000000:[a#42,RANGEKEYSET-z#inf,RANGEKEYSET] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-z#inf,RANGEKEYSET] + bounds: (smallest=range,largest=range) (0x00000001) + +# Extend again with a wide rangedel over the same range. + +extend-point-key-bounds +A.RANGEDEL.0:y +---- +000000:[A#0,RANGEDEL-z#inf,RANGEKEYSET] seqnums:[0-0] points:[A#0,RANGEDEL-y#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-z#inf,RANGEKEYSET] + bounds: (smallest=point,largest=range) (0x00000011) diff --git a/pebble/internal/manifest/testdata/l0_sublevels b/pebble/internal/manifest/testdata/l0_sublevels new file mode 100644 index 0000000..f190ed2 --- /dev/null +++ b/pebble/internal/manifest/testdata/l0_sublevels @@ -0,0 +1,1766 @@ + +define +L0 + 000009:a.SET.10-b.SET.10 + 000007:c.SET.6-d.SET.8 + 000003:e.SET.5-j.SET.7 +---- +file count: 3, sublevels: 1, intervals: 6 +flush split keys(3): [b, d, j] +0.0: file count: 3, bytes: 768, width (mean, max): 1.0, 1, interval range: [0, 4] + 000009:[a#10,1-b#10,1] + 000007:[c#6,1-d#8,1] + 000003:[e#5,1-j#7,1] +compacting file count: 0, base compacting intervals: none +L0.0: a---b c---d e---------------j + aa bb cc dd ee ff gg hh ii jj + +in-use-key-ranges +a-z +a-c +aa-cc +f-g +e-j +---- +a-b, c-d, e-j +a-b, c-d +a-b, c-d +e-j +e-j + +define +L0 + 000009:a.SET.10-b.SET.10 + 000007:b.SET.6-j.SET.8 + 000003:e.SET.5-j.SET.7 +---- +file count: 3, sublevels: 3, intervals: 5 +flush split keys(2): [b, j] +0.2: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000009:[a#10,1-b#10,1] +0.1: file count: 1, 
bytes: 256, width (mean, max): 3.0, 3, interval range: [1, 3] + 000007:[b#6,1-j#8,1] +0.0: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [3, 3] + 000003:[e#5,1-j#7,1] +compacting file count: 0, base compacting intervals: none +L0.2: a---b +L0.1: b------------------------j +L0.0: e---------------j + aa bb cc dd ee ff gg hh ii jj + +in-use-key-ranges +a-z +a-b +a-aa +b-bb +b-j +j-j +---- +a-j +a-j +a-b +b-j +b-j +e-j + +define no_initialize +L0.2 + 000009:a.SET.10-b.SET.10 +L0.1 + 000003:e.SET.5-j.SET.7 +L0.0 + 000007:b.SET.6-j.SET.8 +---- +file count: 3, sublevels: 3, intervals: 0 +flush split keys(0): [] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000009:[a#10,1-b#10,1] +0.1: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000003:[e#5,1-j#7,1] +0.0: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000007:[b#6,1-j#8,1] +compacting file count: 0, base compacting intervals: none +L0.2: a---b +L0.1: e---------------j +L0.0: b------------------------j + aa bb cc dd ee ff gg hh ii jj + +l0-check-ordering +---- +OK + +define no_initialize +L0.1 + 000009:a.SET.10-b.SET.10 +L0.0 + 000007:b.SET.6-j.SET.8 + 000003:e.SET.5-j.SET.7 +---- +file count: 3, sublevels: 2, intervals: 0 +flush split keys(0): [] +0.1: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000009:[a#10,1-b#10,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 0] + 000007:[b#6,1-j#8,1] + 000003:[e#5,1-j#7,1] +compacting file count: 0, base compacting intervals: none +L0.1: a---b +L0.0: b------------------------j e---j + aa bb cc dd ee ff gg hh ii jj + +l0-check-ordering +---- +L0.0 files 000007 and 000003 have overlapping ranges: [b#6,SET-j#8,SET] vs [e#5,SET-j#7,SET] + +define +L0 + 000001:a.SET.2-b.SET.3 + 000002:c.SET.3-d.SET.5 + 000003:e.SET.5-f.SET.7 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 
000009:f.SET.10-i.SET.10 + 000010:f.SET.11-g.SET.11 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 7, sublevels: 5, intervals: 10 +flush split keys(3): [d, f, g] +0.4: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [5, 6] + 000010:[f#11,1-g#11,1] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [5, 8] + 000009:[f#10,1-i#10,1] +0.2: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [5, 7] + 000005:[f#6,1-h#9,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [4, 5] + 000003:[e#5,1-f#7,1] +0.0: file count: 3, bytes: 768, width (mean, max): 1.3, 2, interval range: [0, 6] + 000001:[a#2,1-b#3,1] + 000002:[c#3,1-d#5,1] + 000006:[f#4,1-g#5,1] +compacting file count: 0, base compacting intervals: none +L0.4: f---g +L0.3: f---------i +L0.2: f------h +L0.1: e---f +L0.0: a---b c---d f---g +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +max-depth-after-ongoing-compactions +---- +5 + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 5 +000006,000003,000005,000009,000010,000001,000002 +seed interval: f-f +L0.4: f+++g +L0.3: f+++++++++i +L0.2: f++++++h +L0.1: e+++f +L0.0: a+++b c+++d f+++g +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +# SSTables 000001 and 000002 are optional additions to the above compaction, as they +# overlap with base files that overlap with L0 files in the seed interval. +# Marking 0002 as compacting should be enough to exclude both from the +# chosen compaction. 
+ +in-use-key-ranges +a-z +---- +a-b, c-d, e-i + +define +L0 + 000001:a.SET.2-b.SET.3 + 000002:c.SET.3-d.SET.5 intra_l0_compacting + 000003:e.SET.5-f.SET.7 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000009:f.SET.10-i.SET.10 + 000010:f.SET.11-g.SET.11 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 7, sublevels: 5, intervals: 10 +flush split keys(3): [d, f, g] +0.4: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [5, 6] + 000010:[f#11,1-g#11,1] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [5, 8] + 000009:[f#10,1-i#10,1] +0.2: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [5, 7] + 000005:[f#6,1-h#9,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [4, 5] + 000003:[e#5,1-f#7,1] +0.0: file count: 3, bytes: 768, width (mean, max): 1.3, 2, interval range: [0, 6] + 000001:[a#2,1-b#3,1] + 000002:[c#3,1-d#5,1] + 000006:[f#4,1-g#5,1] +compacting file count: 1, base compacting intervals: none +L0.4: f---g +L0.3: f---------i +L0.2: f------h +L0.1: e---f +L0.0: a---b c^^^d f---g +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 5 +000006,000003,000005,000009,000010 +seed interval: f-f +L0.4: f+++g +L0.3: f+++++++++i +L0.2: f++++++h +L0.1: e+++f +L0.0: a---b c^^^d f+++g +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +# Mark the above compaction as started. 
+ +update-state-for-compaction files=(000006,000003,000005,000009,000010) +---- +OK + +describe +---- +file count: 7, sublevels: 5, intervals: 10 +flush split keys(3): [d, f, g] +0.4: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [5, 6] + 000010:[f#11,1-g#11,1] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [5, 8] + 000009:[f#10,1-i#10,1] +0.2: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [5, 7] + 000005:[f#6,1-h#9,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [4, 5] + 000003:[e#5,1-f#7,1] +0.0: file count: 3, bytes: 768, width (mean, max): 1.3, 2, interval range: [0, 6] + 000001:[a#2,1-b#3,1] + 000002:[c#3,1-d#5,1] + 000006:[f#4,1-g#5,1] +compacting file count: 6, base compacting intervals: [4, 9] +L0.4: fvvvg +L0.3: fvvvvvvvvvi +L0.2: fvvvvvvh +L0.1: evvvf +L0.0: a---b c^^^d fvvvg +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +no compaction picked + +# Extend one of the SSTables (000009) to the right, and place an SSTable "under" +# the extension (000011). This adds it to the compaction. 
+ +define +L0 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000009:f.SET.10-p.SET.10 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(2): [g, p] +0.3: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.2: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:[f#10,1-p#10,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:[f#6,1-h#9,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:[f#4,1-g#5,1] + 000011:[n#8,1-p#10,1] +compacting file count: 0, base compacting intervals: none +L0.3: f---g +L0.2: f------------------------------p +L0.1: f------h +L0.0: f---g n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +in-use-key-ranges +a-z +---- +f-p + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000006,000005,000009,000011,000010 +seed interval: f-g +L0.3: f+++g +L0.2: f++++++++++++++++++++++++++++++p +L0.1: f++++++h +L0.0: f+++g n++++++p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +# Set SSTable 000011 which is under/older SSTable 000009 to IsBaseCompacting = true. +# This should prevent SSTable 000009 from participating in a base compaction. 
+ +define +L0 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000009:f.SET.10-p.SET.10 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.9 base_compacting +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(2): [g, p] +0.3: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.2: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:[f#10,1-p#10,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:[f#6,1-h#9,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:[f#4,1-g#5,1] + 000011:[n#8,1-p#9,1] +compacting file count: 1, base compacting intervals: [3, 4] +L0.3: f---g +L0.2: f------------------------------p +L0.1: f------h +L0.0: f---g nvvvvvvp +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +no compaction picked + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000010,000009,000005,000006 +seed interval: f-g +L0.3: f+++g +L0.2: f++++++++++++++++++++++++++++++p +L0.1: f++++++h +L0.0: f+++g nvvvvvvp +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +# Raise 000009 to a higher level, so that there's still a stack depth of 3 below +# it. This should make f-g a candidate for base compaction again. 
+ +define +L0 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000009:f.SET.12-p.SET.12 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 base_compacting +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(2): [g, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:[f#6,1-h#9,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:[f#4,1-g#5,1] + 000011:[n#8,1-p#10,1] +compacting file count: 1, base compacting intervals: [3, 4] +L0.3: f------------------------------p +L0.2: f---g +L0.1: f------h +L0.0: f---g nvvvvvvp +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 3 +000006,000005,000010 +seed interval: f-g +L0.3: f------------------------------p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g nvvvvvvp +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g nvvvvvvp +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +max-depth-after-ongoing-compactions +---- +4 + +# Assume the above base compaction is chosen. This should reduce max depth after +# ongoing compactions. 
+ +define +L0 + 000005:f.SET.6-h.SET.9 base_compacting + 000006:f.SET.4-g.SET.5 base_compacting + 000009:f.SET.12-p.SET.12 + 000010:f.SET.11-g.SET.11 base_compacting + 000011:n.SET.8-p.SET.10 base_compacting +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(2): [g, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:[f#6,1-h#9,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:[f#4,1-g#5,1] + 000011:[n#8,1-p#10,1] +compacting file count: 4, base compacting intervals: [0, 1], [3, 4] +L0.3: f------------------------------p +L0.2: fvvvg +L0.1: fvvvvvvh +L0.0: fvvvg nvvvvvvp +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +no compaction picked + +pick-intra-l0-compaction min_depth=3 +---- +no compaction picked + +max-depth-after-ongoing-compactions +---- +1 + +# Ensure that when 000011 is not base compacting, it's chosen for compactions +# along with 000009. 
+ +define +L0 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000009:f.SET.12-p.SET.12 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(2): [g, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:[f#6,1-h#9,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:[f#4,1-g#5,1] + 000011:[n#8,1-p#10,1] +compacting file count: 0, base compacting intervals: none +L0.3: f------------------------------p +L0.2: f---g +L0.1: f------h +L0.0: f---g n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000006,000005,000010,000009,000011 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g n++++++p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000011 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g n++++++p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +# Don't pick a base compaction if the overlapping Lbase files are marked as +# compacting. 
+ +define +L0 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000009:f.SET.12-p.SET.12 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 compacting +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(2): [g, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:[f#6,1-h#9,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:[f#4,1-g#5,1] + 000011:[n#8,1-p#10,1] +compacting file count: 0, base compacting intervals: none +L0.3: f------------------------------p +L0.2: f---g +L0.1: f------h +L0.0: f---g n------p +L6: a---------------f g====================================s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +no compaction picked + +# Greatly increase the size of SSTable 000009, past 100 << 20. This should make +# it no longer a candidate for base compaction. 
+ +define +L0 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000009:f.SET.12-p.SET.12 size=104859600 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(4): [g, h, n, p] +0.3: file count: 1, bytes: 104859600, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:[f#6,1-h#9,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:[f#4,1-g#5,1] + 000011:[n#8,1-p#10,1] +compacting file count: 0, base compacting intervals: none +L0.3: f------------------------------p +L0.2: f---g +L0.1: f------h +L0.0: f---g n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 3 +000006,000005,000010,000011 +seed interval: f-g +L0.3: f------------------------------p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g n++++++p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000011 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g n++++++p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +# However, when the size increase is applied to a lower sublevel that is +# necessary to include to meet the minimum stack depth reduction, we overlook +# the size difference and choose the file for compaction anyway. 
+ +define +L0 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000009:f.SET.12-p.SET.12 + 000010:f.SET.11-g.SET.11 size=104859600 + 000011:n.SET.8-p.SET.10 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 5, sublevels: 4, intervals: 5 +flush split keys(2): [g, p] +0.3: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [0, 3] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 104859600, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [0, 1] + 000005:[f#6,1-h#9,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 3] + 000006:[f#4,1-g#5,1] + 000011:[n#8,1-p#10,1] +compacting file count: 0, base compacting intervals: none +L0.3: f------------------------------p +L0.2: f---g +L0.1: f------h +L0.0: f---g n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000006,000005,000010,000009,000011 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g n++++++p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000011 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g n++++++p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +read-amp +---- +4 + +# In L0.0, SST 000007 is marked as base compacting. There are two SSTs to the left +# of it in the sublevel, and one to its right. The ones to its left should be +# chosen by extendCandidateToRectangle. 
+ +define +L0 + 000004:h.SET.2-j.SET.4 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000007:k.SET.2-l.SET.4 base_compacting + 000009:f.SET.12-p.SET.12 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 +L6 + 000012:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 7, sublevels: 4, intervals: 9 +flush split keys(4): [g, h, l, p] +0.3: file count: 1, bytes: 256, width (mean, max): 8.0, 8, interval range: [0, 7] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000005:[f#6,1-h#9,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, interval range: [0, 7] + 000006:[f#4,1-g#5,1] + 000004:[h#2,1-j#4,1] + 000007:[k#2,1-l#4,1] + 000011:[n#8,1-p#10,1] +compacting file count: 1, base compacting intervals: [5, 5] +L0.3: f------------------------------p +L0.2: f---g +L0.1: f------h +L0.0: f---g h------j kvvvl n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000004 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g h++++++j kvvvl n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 3 +000006,000005,000004,000010 +seed interval: f-g +L0.3: f------------------------------p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g h++++++j kvvvl n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + + +# Now shift the base_compacting marker one SST to the left. 
But since file 6 +# was already chosen as part of the seed compaction construction, we still +# prefer to choose it over files 7 and 11. + +define +L0 + 000004:h.SET.2-j.SET.4 base_compacting + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000007:k.SET.2-l.SET.4 + 000009:f.SET.12-p.SET.12 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 +L6 + 000012:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 7, sublevels: 4, intervals: 9 +flush split keys(4): [g, h, l, p] +0.3: file count: 1, bytes: 256, width (mean, max): 8.0, 8, interval range: [0, 7] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000005:[f#6,1-h#9,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, interval range: [0, 7] + 000006:[f#4,1-g#5,1] + 000004:[h#2,1-j#4,1] + 000007:[k#2,1-l#4,1] + 000011:[n#8,1-p#10,1] +compacting file count: 1, base compacting intervals: [2, 3] +L0.3: f------------------------------p +L0.2: f---g +L0.1: f------h +L0.0: f---g hvvvvvvj k---l n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g hvvvvvvj k---l n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +# Without any base_compacting markers, all SSTs in the bottom sublevel should +# be chosen for an intra-L0 compaction. 
+ +define +L0 + 000004:h.SET.2-j.SET.4 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000007:k.SET.2-l.SET.4 + 000009:f.SET.12-p.SET.12 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 +L6 + 000012:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 7, sublevels: 4, intervals: 9 +flush split keys(4): [g, h, l, p] +0.3: file count: 1, bytes: 256, width (mean, max): 8.0, 8, interval range: [0, 7] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000005:[f#6,1-h#9,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, interval range: [0, 7] + 000006:[f#4,1-g#5,1] + 000004:[h#2,1-j#4,1] + 000007:[k#2,1-l#4,1] + 000011:[n#8,1-p#10,1] +compacting file count: 0, base compacting intervals: none +L0.3: f------------------------------p +L0.2: f---g +L0.1: f------h +L0.0: f---g h------j k---l n------p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-intra-l0-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000009,000010,000005,000006,000004,000007,000011 +seed interval: f-g +L0.3: f++++++++++++++++++++++++++++++p +L0.2: f+++g +L0.1: f++++++h +L0.0: f+++g h++++++j k+++l n++++++p +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +define flush_split_max_bytes=32 +L0 + 000001:a.SET.2-e.SET.5 size=64 + 000002:c.SET.6-g.SET.8 size=16 + 000003:f.SET.9-j.SET.11 size=16 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 3, sublevels: 3, intervals: 6 +flush split keys(0): [] +0.2: file count: 1, bytes: 16, width (mean, max): 2.0, 2, interval range: [3, 4] + 000003:[f#9,1-j#11,1] +0.1: file count: 1, bytes: 16, width (mean, max): 3.0, 3, interval range: [1, 3] + 000002:[c#6,1-g#8,1] +0.0: file 
count: 1, bytes: 64, width (mean, max): 2.0, 2, interval range: [0, 1] + 000001:[a#2,1-e#5,1] +compacting file count: 0, base compacting intervals: none +L0.2: f------------j +L0.1: c------------g +L0.0: a------------e +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +# Check that read amplification is the sublevel height of the tallest key +# interval, not the overall count of sublevels. + +read-amp +---- +2 + +in-use-key-ranges +a-z +---- +a-j + +# The comparison of a cumulative count of interpolated bytes and +# flushSplitMaxBytes is a <, so even though the cumulative count equals 32 after +# a-c, we do not emit a flush split key until the end of the next interval, c-e. + +flush-split-keys +---- +flush user split keys: none + +# Reduce flush_split_max_bytes by 1, and there should also be a split key at c. + +define flush_split_max_bytes=31 +L0 + 000001:a.SET.2-e.SET.5 size=64 + 000002:c.SET.6-g.SET.8 size=16 + 000003:f.SET.9-j.SET.11 size=16 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 3, sublevels: 3, intervals: 6 +flush split keys(1): [j] +0.2: file count: 1, bytes: 16, width (mean, max): 2.0, 2, interval range: [3, 4] + 000003:[f#9,1-j#11,1] +0.1: file count: 1, bytes: 16, width (mean, max): 3.0, 3, interval range: [1, 3] + 000002:[c#6,1-g#8,1] +0.0: file count: 1, bytes: 64, width (mean, max): 2.0, 2, interval range: [0, 1] + 000001:[a#2,1-e#5,1] +compacting file count: 0, base compacting intervals: none +L0.2: f------------j +L0.1: c------------g +L0.0: a------------e +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +flush-split-keys +---- +flush user split keys: j + +max-depth-after-ongoing-compactions +---- +2 + +define flush_split_max_bytes=64 +L0 + 000001:a.SET.2-d.SET.5 size=64 + 000002:e.SET.6-g.SET.8 size=64 + 000003:h.SET.9-j.SET.11 size=16 +L6 + 000007:a.SET.0-f.SET.0 + 
000008:g.SET.0-s.SET.0 +---- +file count: 3, sublevels: 1, intervals: 6 +flush split keys(1): [g] +0.0: file count: 3, bytes: 144, width (mean, max): 1.0, 1, interval range: [0, 4] + 000001:[a#2,1-d#5,1] + 000002:[e#6,1-g#8,1] + 000003:[h#9,1-j#11,1] +compacting file count: 0, base compacting intervals: none +L0.0: a---------d e------g h------j +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +flush-split-keys +---- +flush user split keys: g + +# The calculation for flush split bytes multiplies the specified max bytes +# parameter with the number of sublevels. In the case below, that should mean +# a flush split key would not be emitted at d despite the estimated bytes tally +# exceeding 64 bytes. Instead, it would be emitted when 64 * 2 = 128 bytes have +# been exceeded. + +define flush_split_max_bytes=64 +L0 + 000001:a.SET.2-d.SET.5 size=64 + 000004:d.SET.12-e.SET.12 size=64 + 000002:e.SET.6-g.SET.8 size=64 + 000003:h.SET.9-j.SET.11 size=16 +L6 + 000007:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 4, sublevels: 2, intervals: 8 +flush split keys(1): [e] +0.1: file count: 1, bytes: 64, width (mean, max): 3.0, 3, interval range: [1, 3] + 000004:[d#12,1-e#12,1] +0.0: file count: 3, bytes: 144, width (mean, max): 1.7, 2, interval range: [0, 6] + 000001:[a#2,1-d#5,1] + 000002:[e#6,1-g#8,1] + 000003:[h#9,1-j#11,1] +compacting file count: 0, base compacting intervals: none +L0.1: d---e +L0.0: a---------d e------g h------j +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +in-use-key-ranges +b-b +dd-e +dd-i +dd-h +dd-j +dd-s +---- +a-d +d-g +d-g, h-j +d-g, h-j +d-g, h-j +d-g, h-j + +flush-split-keys +---- +flush user split keys: e + +# Ensure that the compaction picker doesn't error out when all seed files are +# compacting. 
+ +define +L0 + 000004:h.SET.2-j.SET.4 base_compacting + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 base_compacting + 000007:k.SET.2-l.SET.4 base_compacting + 000009:f.SET.12-p.SET.12 intra_l0_compacting + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-p.SET.10 base_compacting +L6 + 000012:a.SET.0-f.SET.0 + 000008:g.SET.0-s.SET.0 +---- +file count: 7, sublevels: 4, intervals: 9 +flush split keys(4): [g, h, l, p] +0.3: file count: 1, bytes: 256, width (mean, max): 8.0, 8, interval range: [0, 7] + 000009:[f#12,1-p#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000005:[f#6,1-h#9,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, interval range: [0, 7] + 000006:[f#4,1-g#5,1] + 000004:[h#2,1-j#4,1] + 000007:[k#2,1-l#4,1] + 000011:[n#8,1-p#10,1] +compacting file count: 5, base compacting intervals: [0, 0], [2, 3], [5, 5], [7, 8] +L0.3: f^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^p +L0.2: f---g +L0.1: f------h +L0.0: fvvvg hvvvvvvj kvvvl nvvvvvvp +L6: a---------------f g------------------------------------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=2 +---- +no compaction picked + +pick-intra-l0-compaction min_depth=2 +---- +no compaction picked + +# Ensure that base files with largest key set to the rangedel sentinel key are +# treated as not containing the largest user key. If L0 files containing that +# user key get added to that compaction, it could trigger a +# "files have overlapping ranges" error in Lbase as one of the outputs of the +# compaction would overlap with an Lbase file not in the compaction. +# Compare the output of the next two calls to PickBaseCompaction below; as the +# base file's end key is changed to the range deletion sentinel, L0 files +# overlapping with it are no longer chosen for compaction. 
+ +define +L0 + 000004:h.SET.2-j.SET.4 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000007:k.SET.2-l.SET.4 + 000009:n.SET.12-o.SET.12 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-o.SET.10 +L6 + 000001:a.SET.0-o.SET.0 + 000008:p.SET.0-s.SET.0 +---- +file count: 7, sublevels: 3, intervals: 9 +flush split keys(4): [g, h, l, o] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 2, bytes: 512, width (mean, max): 2.0, 3, interval range: [0, 7] + 000005:[f#6,1-h#9,1] + 000009:[n#12,1-o#12,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, interval range: [0, 7] + 000006:[f#4,1-g#5,1] + 000004:[h#2,1-j#4,1] + 000007:[k#2,1-l#4,1] + 000011:[n#8,1-o#10,1] +compacting file count: 0, base compacting intervals: none +L0.2: f---g +L0.1: f------h n---o +L0.0: f---g h------j k---l n---o +L6: a------------------------------------------o p---------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=2 +---- +compaction picked with stack depth reduction 3 +000006,000005,000004,000010,000007,000011,000009 +seed interval: f-g +L0.2: f+++g +L0.1: f++++++h n+++o +L0.0: f+++g h++++++j k+++l n+++o +L6: a------------------------------------------o p---------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +define +L0 + 000004:h.SET.2-j.SET.4 + 000005:f.SET.6-h.SET.9 + 000006:f.SET.4-g.SET.5 + 000007:k.SET.2-l.SET.4 + 000009:n.SET.12-o.SET.12 + 000010:f.SET.11-g.SET.11 + 000011:n.SET.8-o.SET.10 +L6 + 000001:a.SET.0-o.RANGEDEL.72057594037927935 + 000008:p.SET.0-s.SET.0 +---- +file count: 7, sublevels: 3, intervals: 9 +flush split keys(4): [g, h, l, o] +0.2: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000010:[f#11,1-g#11,1] +0.1: file count: 2, bytes: 512, width (mean, max): 2.0, 3, interval range: [0, 7] + 000005:[f#6,1-h#9,1] + 000009:[n#12,1-o#12,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 1.2, 2, 
interval range: [0, 7] + 000006:[f#4,1-g#5,1] + 000004:[h#2,1-j#4,1] + 000007:[k#2,1-l#4,1] + 000011:[n#8,1-o#10,1] +compacting file count: 0, base compacting intervals: none +L0.2: f---g +L0.1: f------h n---o +L0.0: f---g h------j k---l n---o +L6: a------------------------------------------o p---------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +pick-base-compaction min_depth=2 +---- +compaction picked with stack depth reduction 3 +000006,000005,000004,000010,000007,000011,000009 +seed interval: f-g +L0.2: f+++g +L0.1: f++++++h n+++o +L0.0: f+++g h++++++j k+++l n+++o +L6: a------------------------------------------o p---------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +in-use-key-ranges +a-z +n-o +---- +f-j, k-l, n-o +n-o + +# Ensure that two L0 sstables where one ends at a rangedel sentinel key and +# the other starts at the same user key occupy the same sublevel. + +define +L0 + 000004:a.SET.2-d.RANGEDEL.72057594037927935 + 000005:d.SET.3-g.SET.5 +L6 + 000001:a.SET.0-o.SET.0 + 000008:p.SET.0-s.SET.0 +---- +file count: 2, sublevels: 1, intervals: 3 +flush split keys(2): [d, g] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 1] + 000004:[a#2,1-d#72057594037927935,15] + 000005:[d#3,1-g#5,1] +compacting file count: 0, base compacting intervals: none +L0.0: a--------d---------g +L6: a------------------------------------------o p---------s + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss + +in-use-key-ranges +a-z +a-g +b-c +---- +a-g +a-g +a-d + +define +L0 + 000004:a.SET.2-d.RANGEDEL.72057594037927935 + 000005:d.SET.3-g.SET.5 + 000006:f.SET.6-i.SET.6 + 000007:h.SET.7-m.SET.7 + 000009:q.SET.7-r.SET.7 + 000010:g.SET.10-i.SET.10 +---- +file count: 6, sublevels: 4, intervals: 10 +flush split keys(4): [f, g, i, r] +0.3: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [3, 5] + 000010:[g#10,1-i#10,1] +0.2: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval 
range: [5, 6] + 000007:[h#7,1-m#7,1] +0.1: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [2, 5] + 000006:[f#6,1-i#6,1] +0.0: file count: 3, bytes: 768, width (mean, max): 1.7, 3, interval range: [0, 8] + 000004:[a#2,1-d#72057594037927935,15] + 000005:[d#3,1-g#5,1] + 000009:[q#7,1-r#7,1] +compacting file count: 0, base compacting intervals: none +L0.3: g------i +L0.2: h---------------m +L0.1: f---------i +L0.0: a--------d---------g q---r + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr + +in-use-key-ranges +f-m +f-n +f-l +ff-m +ff-n +ff-l +---- +f-m +f-m +f-m +f-m +f-m +f-m + +in-use-key-ranges +n-o +m-q +l-qq +---- +. +i-m, q-r +i-m, q-r + +in-use-key-ranges +a-z +g-l +---- +a-m, q-r +g-m + +in-use-key-ranges +a-ff +a-gg +a-i +d-d +---- +a-g +a-i +a-m +d-g + +# Same example as above, except we incrementally add the sublevels. The output +# of in-use-key-ranges must be the same. + +define +L0 + 000004:a.SET.2-d.RANGEDEL.72057594037927935 +---- +file count: 1, sublevels: 1, intervals: 2 +flush split keys(1): [d] +0.0: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000004:[a#2,1-d#72057594037927935,15] +compacting file count: 0, base compacting intervals: none +L0.0: a---------d + aa bb cc dd + +add-l0-files + 000005:d.SET.3-g.SET.5 +---- +file count: 2, sublevels: 1, intervals: 3 +flush split keys(2): [d, g] +0.0: file count: 2, bytes: 512, width (mean, max): 1.0, 1, interval range: [0, 1] + 000004:[a#2,1-d#72057594037927935,15] + 000005:[d#3,1-g#5,1] +compacting file count: 0, base compacting intervals: none +L0.0: a--------d---------g + aa bb cc dd ee ff gg + +add-l0-files + 000006:f.SET.6-i.SET.6 +---- +file count: 3, sublevels: 2, intervals: 5 +flush split keys(2): [d, g] +0.1: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [2, 3] + 000006:[f#6,1-i#6,1] +0.0: file count: 2, bytes: 512, width (mean, max): 1.5, 2, interval range: [0, 2] + 000004:[a#2,1-d#72057594037927935,15] + 
000005:[d#3,1-g#5,1] +compacting file count: 0, base compacting intervals: none +L0.1: f---------i +L0.0: a--------d---------g + aa bb cc dd ee ff gg hh ii + +add-l0-files + 000007:h.SET.7-m.SET.7 + 000009:q.SET.8-r.SET.8 +---- +file count: 5, sublevels: 3, intervals: 9 +flush split keys(4): [d, g, i, r] +0.2: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [4, 5] + 000007:[h#7,1-m#7,1] +0.1: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [2, 4] + 000006:[f#6,1-i#6,1] +0.0: file count: 3, bytes: 768, width (mean, max): 1.3, 2, interval range: [0, 7] + 000004:[a#2,1-d#72057594037927935,15] + 000005:[d#3,1-g#5,1] + 000009:[q#8,1-r#8,1] +compacting file count: 0, base compacting intervals: none +L0.2: h---------------m +L0.1: f---------i +L0.0: a--------d---------g q---r + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr + +# The output below should exactly match the output of the second last define. + +add-l0-files + 000010:g.SET.10-i.SET.10 +---- +file count: 6, sublevels: 4, intervals: 10 +flush split keys(4): [f, g, i, r] +0.3: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [3, 5] + 000010:[g#10,1-i#10,1] +0.2: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [5, 6] + 000007:[h#7,1-m#7,1] +0.1: file count: 1, bytes: 256, width (mean, max): 4.0, 4, interval range: [2, 5] + 000006:[f#6,1-i#6,1] +0.0: file count: 3, bytes: 768, width (mean, max): 1.7, 3, interval range: [0, 8] + 000004:[a#2,1-d#72057594037927935,15] + 000005:[d#3,1-g#5,1] + 000009:[q#8,1-r#8,1] +compacting file count: 0, base compacting intervals: none +L0.3: g------i +L0.2: h---------------m +L0.1: f---------i +L0.0: a--------d---------g q---r + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr + +in-use-key-ranges +f-m +f-n +f-l +ff-m +ff-n +ff-l +---- +f-m +f-m +f-m +f-m +f-m +f-m + +in-use-key-ranges +n-o +m-q +l-qq +---- +. 
+i-m, q-r +i-m, q-r + +in-use-key-ranges +a-z +g-l +---- +a-m, q-r +g-m + +in-use-key-ranges +a-ff +a-gg +a-i +d-d +---- +a-g +a-i +a-m +d-g + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 3 +000005,000006,000010,000007,000004,000009 +seed interval: g-g +L0.3: g++++++i +L0.2: h+++++++++++++++m +L0.1: f+++++++++i +L0.0: a++++++++d+++++++++g q+++r + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr + +# Adding two overlapping L0 files is supported too, as long as they're disjoint +# in sequence number ranges. + +add-l0-files + 000011:b.SET.13-e.SET.15 + 000012:c.SET.16-e.SET.17 +---- +file count: 8, sublevels: 4, intervals: 13 +flush split keys(5): [d, e, g, i, r] +0.3: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [6, 8] + 000010:[g#10,1-i#10,1] +0.2: file count: 2, bytes: 512, width (mean, max): 2.0, 2, interval range: [2, 9] + 000012:[c#16,1-e#17,1] + 000007:[h#7,1-m#7,1] +0.1: file count: 2, bytes: 512, width (mean, max): 3.5, 4, interval range: [1, 8] + 000011:[b#13,1-e#15,1] + 000006:[f#6,1-i#6,1] +0.0: file count: 3, bytes: 768, width (mean, max): 2.7, 4, interval range: [0, 11] + 000004:[a#2,1-d#72057594037927935,15] + 000005:[d#3,1-g#5,1] + 000009:[q#8,1-r#8,1] +compacting file count: 0, base compacting intervals: none +L0.3: g------i +L0.2: c------e h---------------m +L0.1: b---------e f---------i +L0.0: a--------d---------g q---r + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr + +# Multiple sublevels can also be added in one add-l0-files. 
+ +add-l0-files + 000013:h.SET.18-i.SET.19 + 000014:g.SET.20-i.SET.21 +---- +file count: 10, sublevels: 6, intervals: 13 +flush split keys(4): [d, g, h, i] +0.5: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [6, 8] + 000014:[g#20,1-i#21,1] +0.4: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [8, 8] + 000013:[h#18,1-i#19,1] +0.3: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [6, 8] + 000010:[g#10,1-i#10,1] +0.2: file count: 2, bytes: 512, width (mean, max): 2.0, 2, interval range: [2, 9] + 000012:[c#16,1-e#17,1] + 000007:[h#7,1-m#7,1] +0.1: file count: 2, bytes: 512, width (mean, max): 3.5, 4, interval range: [1, 8] + 000011:[b#13,1-e#15,1] + 000006:[f#6,1-i#6,1] +0.0: file count: 3, bytes: 768, width (mean, max): 2.7, 4, interval range: [0, 11] + 000004:[a#2,1-d#72057594037927935,15] + 000005:[d#3,1-g#5,1] + 000009:[q#8,1-r#8,1] +compacting file count: 0, base compacting intervals: none +L0.5: g------i +L0.4: h---i +L0.3: g------i +L0.2: c------e h---------------m +L0.1: b---------e f---------i +L0.0: a--------d---------g q---r + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr + +# Adding an old L0 file returns an error. + +add-l0-files + 000015:h.SET.17-i.SET.17 +---- +pebble: L0 sublevel generation optimization cannot be used + +# The following test cases cover the examples provided in the documentation. +# NOTE: following initialization, some of the files fall down into lower levels +# where there is space. + +# Example 1. No in-progress L0 -> LBase compaction. 
+ +define +L0.3 + 000011:a.SET.18-d.SET.19 + 000012:g.SET.20-j.SET.21 +L0.2 + 000009:f.SET.14-j.SET.15 + 000010:r.SET.16-t.SET.17 +L0.1 + 000007:b.SET.10-d.SET.11 + 000008:e.SET.12-j.SET.13 +L0.0 + 000003:a.SET.2-d.SET.3 + 000004:f.SET.4-j.SET.5 + 000005:l.SET.6-o.SET.7 + 000006:p.SET.8-x.SET.9 +L6 + 000001:a.SET.0-i.SET.0 + 000002:m.SET.0-w.SET.0 +---- +file count: 10, sublevels: 4, intervals: 13 +flush split keys(5): [d, g, j, r, t] +0.3: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [5, 5] + 000012:[g#20,1-j#21,1] +0.2: file count: 2, bytes: 512, width (mean, max): 2.0, 2, interval range: [0, 5] + 000011:[a#18,1-d#19,1] + 000009:[f#14,1-j#15,1] +0.1: file count: 3, bytes: 768, width (mean, max): 1.7, 3, interval range: [1, 10] + 000007:[b#10,1-d#11,1] + 000008:[e#12,1-j#13,1] + 000010:[r#16,1-t#17,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 2.0, 3, interval range: [0, 11] + 000003:[a#2,1-d#3,1] + 000004:[f#4,1-j#5,1] + 000005:[l#6,1-o#7,1] + 000006:[p#8,1-x#9,1] +compacting file count: 0, base compacting intervals: none +L0.3: g---------j +L0.2: a---------d f------------j +L0.1: b------d e---------------j r------t +L0.0: a---------d f------------j l---------o p------------------------x +L6: a------------------------i m------------------------------w + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx + +pick-base-compaction min_depth=3 +---- +compaction picked with stack depth reduction 4 +000004,000008,000009,000012,000003,000007,000011 +seed interval: g-j +L0.3: g+++++++++j +L0.2: a+++++++++d f++++++++++++j +L0.1: b++++++d e+++++++++++++++j r------t +L0.0: a+++++++++d f++++++++++++j l---------o p------------------------x +L6: a------------------------i m------------------------------w + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx + +# Example 2. Left half of the keyspace compacting. Select the "next best" +# compaction. 
+ +define +L0.3 + 000011:a.SET.18-d.SET.19 base_compacting + 000012:g.SET.20-j.SET.21 base_compacting +L0.2 + 000009:f.SET.14-j.SET.15 base_compacting + 000010:r.SET.16-t.SET.17 +L0.1 + 000007:b.SET.10-d.SET.11 base_compacting + 000008:e.SET.12-j.SET.13 base_compacting +L0.0 + 000003:a.SET.2-d.SET.3 base_compacting + 000004:f.SET.4-j.SET.5 base_compacting + 000005:l.SET.6-o.SET.7 + 000006:p.SET.8-x.SET.9 +L6 + 000001:a.SET.0-i.SET.0 + 000002:m.SET.0-w.SET.0 +---- +file count: 10, sublevels: 4, intervals: 13 +flush split keys(5): [d, g, j, r, t] +0.3: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [5, 5] + 000012:[g#20,1-j#21,1] +0.2: file count: 2, bytes: 512, width (mean, max): 2.0, 2, interval range: [0, 5] + 000011:[a#18,1-d#19,1] + 000009:[f#14,1-j#15,1] +0.1: file count: 3, bytes: 768, width (mean, max): 1.7, 3, interval range: [1, 10] + 000007:[b#10,1-d#11,1] + 000008:[e#12,1-j#13,1] + 000010:[r#16,1-t#17,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 2.0, 3, interval range: [0, 11] + 000003:[a#2,1-d#3,1] + 000004:[f#4,1-j#5,1] + 000005:[l#6,1-o#7,1] + 000006:[p#8,1-x#9,1] +compacting file count: 7, base compacting intervals: [0, 6] +L0.3: gvvvvvvvvvj +L0.2: avvvvvvvvvd fvvvvvvvvvvvvj +L0.1: bvvvvvvd evvvvvvvvvvvvvvvj r------t +L0.0: avvvvvvvvvd fvvvvvvvvvvvvj l---------o p------------------------x +L6: a------------------------i m------------------------------w + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx + +pick-base-compaction min_depth=3 +---- +no compaction picked + +pick-base-compaction min_depth=2 +---- +compaction picked with stack depth reduction 2 +000006,000010,000005 +seed interval: r-t +L0.3: gvvvvvvvvvj +L0.2: avvvvvvvvvd fvvvvvvvvvvvvj +L0.1: bvvvvvvd evvvvvvvvvvvvvvvj r++++++t +L0.0: avvvvvvvvvd fvvvvvvvvvvvvj l+++++++++o p++++++++++++++++++++++++x +L6: a------------------------i m------------------------------w + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww 
xx + +# Example 3. The same as Example 2, except there is now an additional file in +# LBase that overlaps with the [l,o] file in L0.0. + +define +L0.3 + 000011:a.SET.18-d.SET.19 base_compacting + 000012:g.SET.20-j.SET.21 base_compacting +L0.2 + 000009:f.SET.14-j.SET.15 base_compacting + 000010:r.SET.16-t.SET.17 +L0.1 + 000007:b.SET.10-d.SET.11 base_compacting + 000008:e.SET.12-j.SET.13 base_compacting +L0.0 + 000003:a.SET.2-d.SET.3 base_compacting + 000004:f.SET.4-j.SET.5 base_compacting + 000005:l.SET.6-o.SET.7 + 000006:p.SET.8-x.SET.9 +L6 + 000001:a.SET.0-i.SET.0 + 000013:j.SET.0-l.SET.0 + 000002:m.SET.0-w.SET.0 +---- +file count: 10, sublevels: 4, intervals: 13 +flush split keys(5): [d, g, j, r, t] +0.3: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [5, 5] + 000012:[g#20,1-j#21,1] +0.2: file count: 2, bytes: 512, width (mean, max): 2.0, 2, interval range: [0, 5] + 000011:[a#18,1-d#19,1] + 000009:[f#14,1-j#15,1] +0.1: file count: 3, bytes: 768, width (mean, max): 1.7, 3, interval range: [1, 10] + 000007:[b#10,1-d#11,1] + 000008:[e#12,1-j#13,1] + 000010:[r#16,1-t#17,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 2.0, 3, interval range: [0, 11] + 000003:[a#2,1-d#3,1] + 000004:[f#4,1-j#5,1] + 000005:[l#6,1-o#7,1] + 000006:[p#8,1-x#9,1] +compacting file count: 7, base compacting intervals: [0, 6] +L0.3: gvvvvvvvvvj +L0.2: avvvvvvvvvd fvvvvvvvvvvvvj +L0.1: bvvvvvvd evvvvvvvvvvvvvvvj r------t +L0.0: avvvvvvvvvd fvvvvvvvvvvvvj l---------o p------------------------x +L6: a------------------------i j------l m------------------------------w + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx + +pick-base-compaction min_depth=2 +---- +compaction picked with stack depth reduction 2 +000006,000010 +seed interval: r-t +L0.3: gvvvvvvvvvj +L0.2: avvvvvvvvvd fvvvvvvvvvvvvj +L0.1: bvvvvvvd evvvvvvvvvvvvvvvj r++++++t +L0.0: avvvvvvvvvd fvvvvvvvvvvvvj l---------o p++++++++++++++++++++++++x +L6: a------------------------i 
j------l m------------------------------w + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx + +# Example 4. Intra-L0 compactions. + +define +L0.3 + 000011:a.SET.18-d.SET.19 + 000012:g.SET.20-j.SET.21 base_compacting +L0.2 + 000009:f.SET.14-j.SET.15 base_compacting + 000010:r.SET.16-t.SET.17 base_compacting +L0.1 + 000007:b.SET.10-d.SET.11 + 000008:e.SET.12-j.SET.13 base_compacting +L0.0 + 000003:a.SET.2-d.SET.3 + 000004:f.SET.4-j.SET.5 base_compacting + 000005:l.SET.6-o.SET.7 + 000006:p.SET.8-x.SET.9 base_compacting +L6 + 000001:a.SET.0-i.SET.0 + 000002:m.SET.0-w.SET.0 +---- +file count: 10, sublevels: 4, intervals: 13 +flush split keys(5): [d, g, j, r, t] +0.3: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [5, 5] + 000012:[g#20,1-j#21,1] +0.2: file count: 2, bytes: 512, width (mean, max): 2.0, 2, interval range: [0, 5] + 000011:[a#18,1-d#19,1] + 000009:[f#14,1-j#15,1] +0.1: file count: 3, bytes: 768, width (mean, max): 1.7, 3, interval range: [1, 10] + 000007:[b#10,1-d#11,1] + 000008:[e#12,1-j#13,1] + 000010:[r#16,1-t#17,1] +0.0: file count: 4, bytes: 1024, width (mean, max): 2.0, 3, interval range: [0, 11] + 000003:[a#2,1-d#3,1] + 000004:[f#4,1-j#5,1] + 000005:[l#6,1-o#7,1] + 000006:[p#8,1-x#9,1] +compacting file count: 6, base compacting intervals: [3, 6], [9, 12] +L0.3: gvvvvvvvvvj +L0.2: a---------d fvvvvvvvvvvvvj +L0.1: b------d evvvvvvvvvvvvvvvj rvvvvvvt +L0.0: a---------d fvvvvvvvvvvvvj l---------o pvvvvvvvvvvvvvvvvvvvvvvvvx +L6: a------------------------i m------------------------------w + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx + +pick-intra-l0-compaction min_depth=2 +---- +compaction picked with stack depth reduction 3 +000011,000007,000003 +seed interval: b-d +L0.3: gvvvvvvvvvj +L0.2: a+++++++++d fvvvvvvvvvvvvj +L0.1: b++++++d evvvvvvvvvvvvvvvj rvvvvvvt +L0.0: a+++++++++d fvvvvvvvvvvvvj l---------o pvvvvvvvvvvvvvvvvvvvvvvvvx +L6: a------------------------i 
m------------------------------w + aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx + +# Regression test for cockroachdb/cockroach#101896. We must return +# errInvalidL0SublevelOpt in any case where a new L0 file is being AddL0File'd +# with a largest sequence number below an existing file in the same interval. + +define +L0 + 000004:a.SET.2-e.SET.3 + 000006:a.SET.7-b.SET.8 + 000007:d.SET.12-f.SET.12 +---- +file count: 3, sublevels: 2, intervals: 5 +flush split keys(2): [b, e] +0.1: file count: 2, bytes: 512, width (mean, max): 1.5, 2, interval range: [0, 3] + 000006:[a#7,1-b#8,1] + 000007:[d#12,1-f#12,1] +0.0: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000004:[a#2,1-e#3,1] +compacting file count: 0, base compacting intervals: none +L0.1: a---b d------f +L0.0: a------------e + aa bb cc dd ee ff + +# Note that 000006 will bump the sublevel for the incoming file to 2. We +# should still realize that it's slotting below 000007 and return an error. + +add-l0-files + 000015:a.SET.9-g.SET.10 +---- +pebble: L0 sublevel generation optimization cannot be used + +# Fully-regenerated L0 sublevels allow us to pick an intra-L0 compaction that +# does not violate sublevel ordering. 
+ +define +L0 + 000004:a.SET.2-e.SET.3 + 000006:a.SET.7-b.SET.8 + 000007:d.SET.12-f.SET.12 + 000015:a.SET.9-g.SET.10 +---- +file count: 4, sublevels: 4, intervals: 6 +flush split keys(2): [b, e] +0.3: file count: 1, bytes: 256, width (mean, max): 2.0, 2, interval range: [2, 3] + 000007:[d#12,1-f#12,1] +0.2: file count: 1, bytes: 256, width (mean, max): 5.0, 5, interval range: [0, 4] + 000015:[a#9,1-g#10,1] +0.1: file count: 1, bytes: 256, width (mean, max): 1.0, 1, interval range: [0, 0] + 000006:[a#7,1-b#8,1] +0.0: file count: 1, bytes: 256, width (mean, max): 3.0, 3, interval range: [0, 2] + 000004:[a#2,1-e#3,1] +compacting file count: 0, base compacting intervals: none +L0.3: d------f +L0.2: a------------------g +L0.1: a---b +L0.0: a------------e + aa bb cc dd ee ff gg + +# Exclude the d-f file through earliest_unflushed_seqnum. + +pick-intra-l0-compaction min_depth=2 earliest_unflushed_seqnum=11 +---- +compaction picked with stack depth reduction 3 +000015,000006,000004 +seed interval: a-b +L0.3: d------f +L0.2: a++++++++++++++++++g +L0.1: a+++b +L0.0: a++++++++++++e + aa bb cc dd ee ff gg + +pick-intra-l0-compaction min_depth=2 +---- +compaction picked with stack depth reduction 3 +000015,000007,000006,000004 +seed interval: a-b +L0.3: d++++++f +L0.2: a++++++++++++++++++g +L0.1: a+++b +L0.0: a++++++++++++e + aa bb cc dd ee ff gg diff --git a/pebble/internal/manifest/testdata/level_iterator b/pebble/internal/manifest/testdata/level_iterator new file mode 100644 index 0000000..dca3e8e --- /dev/null +++ b/pebble/internal/manifest/testdata/level_iterator @@ -0,0 +1,128 @@ +define +[ ] +---- + +iter +first +last +seek-lt a +seek-lt z +seek-ge a +seek-ge z +---- +. +. +. +. +. +. + +define +[ a.SET.1-b.SET.2 ] +---- + +iter +last +---- +000001:[a#1,1-b#2,1] + +iter +first +next +prev +prev +---- +000001:[a#1,1-b#2,1] +. +000001:[a#1,1-b#2,1] +. + +iter +seek-ge a +seek-ge b +seek-ge c +---- +000001:[a#1,1-b#2,1] +000001:[a#1,1-b#2,1] +. 
+ +iter +seek-lt a +seek-lt b +seek-lt z +---- +. +000001:[a#1,1-b#2,1] +000001:[a#1,1-b#2,1] + +define +[ b.SET.1-c.SET.2 ] +---- + +iter +seek-ge a +seek-ge d +seek-lt a +seek-lt z +---- +000001:[b#1,1-c#2,1] +. +. +000001:[b#1,1-c#2,1] + + +define +a.SET.1-b.SET.2 [ c.SET.3-d.SET.4 e.SET.5-f.SET.6 ] g.SET.7-h.SET.8 +---- + +iter +first +prev +last +next +---- +000002:[c#3,1-d#4,1] +. +000003:[e#5,1-f#6,1] +. + +iter +seek-ge a +seek-ge b +seek-ge c +seek-ge h +prev +---- +000002:[c#3,1-d#4,1] +000002:[c#3,1-d#4,1] +000002:[c#3,1-d#4,1] +. +000003:[e#5,1-f#6,1] + +iter +seek-lt b +next +seek-lt a +next +seek-lt z +---- +. +000002:[c#3,1-d#4,1] +. +000002:[c#3,1-d#4,1] +000003:[e#5,1-f#6,1] + +define +a.SET.1-b.SET.2 c.SET.3-d.SET.4 e.SET.5-f.SET.6 g.SET.7-h.SET.8 [ ] +---- + +iter +seek-ge cat +seek-lt cat +first +last +---- +. +. +. +. diff --git a/pebble/internal/manifest/testdata/level_iterator_filtered b/pebble/internal/manifest/testdata/level_iterator_filtered new file mode 100644 index 0000000..066207c --- /dev/null +++ b/pebble/internal/manifest/testdata/level_iterator_filtered @@ -0,0 +1,537 @@ +define +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +---- + +iter key-type=points +seek-ge a +seek-ge m +seek-ge n +seek-ge o +seek-ge p +---- +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +. +. + +iter key-type=ranges +seek-ge a +seek-ge m +seek-ge n +seek-ge o +seek-ge p +---- +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +. +. +. +. 
+ +iter key-type=points +seek-lt a +seek-lt b +seek-lt c +seek-lt j +seek-lt k +seek-lt l +seek-lt m +seek-lt n +seek-lt o +seek-lt p +---- +. +. +. +. +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] + +iter key-type=ranges +seek-lt a +seek-lt b +seek-lt c +seek-lt j +seek-lt k +seek-lt l +seek-lt m +seek-lt n +seek-lt o +seek-lt p +---- +. 
+000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] + +iter key-type=points +seek-lt a +next +next +seek-ge o +prev +prev +---- +. +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +. +. +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +. + +iter key-type=ranges +seek-lt a +next +next +seek-ge m +prev +prev +---- +. +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +. +. +000000:[a#42,RANGEKEYSET-o#inf,RANGEDEL] seqnums:[0-0] points:[j#0,SET-o#inf,RANGEDEL] ranges:[a#42,RANGEKEYSET-m#inf,RANGEKEYSET] +. 
+ +define +000000:[a#9,SET-b#2,DEL] points:[a#9,SET-b#2,DEL] +000001:[c#9,SET-d#2,DEL] points:[c#9,SET-d#2,DEL] +000002:[e#9,SET-f#2,DEL] points:[e#9,SET-f#2,DEL] +000003:[g#9,SET-g#2,DEL] points:[g#9,SET-g#2,DEL] +000004:[i#9,SET-j#2,DEL] points:[i#9,SET-j#2,DEL] +000005:[k#9,SET-k#2,DEL] points:[k#9,SET-k#2,DEL] +---- + +iter key-type=points +seek-ge a +seek-ge apple +seek-ge b +seek-ge banana +seek-ge c +seek-ge cantalope +seek-ge d +seek-ge dragonfruit +---- +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000001:[c#9,SET-d#2,DEL] seqnums:[0-0] points:[c#9,SET-d#2,DEL] +000001:[c#9,SET-d#2,DEL] seqnums:[0-0] points:[c#9,SET-d#2,DEL] +000001:[c#9,SET-d#2,DEL] seqnums:[0-0] points:[c#9,SET-d#2,DEL] +000001:[c#9,SET-d#2,DEL] seqnums:[0-0] points:[c#9,SET-d#2,DEL] +000002:[e#9,SET-f#2,DEL] seqnums:[0-0] points:[e#9,SET-f#2,DEL] + +iter key-type=points +seek-lt a +seek-lt apple +seek-lt b +seek-lt banana +seek-lt c +seek-lt cantalope +seek-lt d +seek-lt dragonfruit +---- +. +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000001:[c#9,SET-d#2,DEL] seqnums:[0-0] points:[c#9,SET-d#2,DEL] +000001:[c#9,SET-d#2,DEL] seqnums:[0-0] points:[c#9,SET-d#2,DEL] +000001:[c#9,SET-d#2,DEL] seqnums:[0-0] points:[c#9,SET-d#2,DEL] + +iter key-type=ranges +seek-ge a +seek-ge apple +seek-ge b +seek-ge banana +seek-ge c +seek-ge cantalope +seek-ge d +seek-ge dragonfruit +---- +. +. +. +. +. +. +. +. + +iter key-type=ranges +seek-lt a +seek-lt apple +seek-lt b +seek-lt banana +seek-lt c +seek-lt cantalope +seek-lt d +seek-lt dragonfruit +---- +. +. +. +. +. +. +. +. 
+ +define +000000:[a#9,SET-b#2,DEL] points:[a#9,SET-b#2,DEL] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000002:[e#9,SET-f#inf,RANGEKEYDEL] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000003:[g#9,SET-g#2,DEL] points:[g#9,SET-g#2,DEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000005:[k#9,SET-k#2,DEL] points:[k#9,SET-k#2,DEL] +---- + +iter key-type=both +seek-ge a +seek-ge apple +seek-ge b +seek-ge banana +seek-ge c +seek-ge cantalope +seek-ge d +seek-ge dragonfruit +seek-ge e +seek-ge elderberry +seek-ge f +seek-ge figs +seek-ge g +seek-ge guava +seek-ge h +seek-ge huckleberry +seek-ge i +seek-ge incaberry +seek-ge j +seek-ge jujube +seek-ge k +seek-ge kiwi +seek-ge l +---- +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] 
points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +. +. + +iter key-type=both +seek-lt a +seek-lt apple +seek-lt b +seek-lt banana +seek-lt c +seek-lt cantalope +seek-lt d +seek-lt dragonfruit +seek-lt e +seek-lt elderberry +seek-lt f +seek-lt figs +seek-lt g +seek-lt guava +seek-lt h +seek-lt huckleberry +seek-lt i +seek-lt incaberry +seek-lt j +seek-lt jujube +seek-lt k +seek-lt kiwi +seek-lt l +---- +. 
+000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] 
+000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] + + +iter key-type=points +seek-ge a +seek-ge apple +seek-ge b +seek-ge banana +seek-ge c +seek-ge cantalope +seek-ge d +seek-ge dragonfruit +seek-ge e +seek-ge elderberry +seek-ge f +seek-ge figs +seek-ge g +seek-ge guava +seek-ge h +seek-ge huckleberry +seek-ge i +seek-ge incaberry +seek-ge j +seek-ge jujube +seek-ge k +seek-ge kiwi +seek-ge l +---- +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] 
ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +. +. + +iter key-type=points +seek-lt a +seek-lt apple +seek-lt b +seek-lt banana +seek-lt c +seek-lt cantalope +seek-lt d +seek-lt dragonfruit +seek-lt e +seek-lt elderberry +seek-lt f +seek-lt figs +seek-lt g +seek-lt guava +seek-lt h +seek-lt huckleberry +seek-lt i +seek-lt incaberry +seek-lt j +seek-lt jujube +seek-lt k +seek-lt kiwi +seek-lt l +---- +. 
+000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] + +iter key-type=ranges +seek-ge a +seek-ge apple +seek-ge b +seek-ge banana +seek-ge c +seek-ge 
cantalope +seek-ge d +seek-ge dragonfruit +seek-ge e +seek-ge elderberry +seek-ge f +seek-ge figs +seek-ge g +seek-ge guava +seek-ge h +seek-ge huckleberry +seek-ge i +seek-ge incaberry +seek-ge j +seek-ge jujube +seek-ge k +seek-ge kiwi +seek-ge l +---- +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] 
points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +. +. +. +. + +iter key-type=ranges +seek-lt a +seek-lt apple +seek-lt b +seek-lt banana +seek-lt c +seek-lt cantalope +seek-lt d +seek-lt dragonfruit +seek-lt e +seek-lt elderberry +seek-lt f +seek-lt figs +seek-lt g +seek-lt guava +seek-lt h +seek-lt huckleberry +seek-lt i +seek-lt incaberry +seek-lt j +seek-lt jujube +seek-lt k +seek-lt kiwi +seek-lt l +---- +. +. +. +. +. 
+000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] 
+000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] + +iter key-type=both +first +next +next +next +next +next +next +---- +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +. + +iter key-type=points +first +next +next +next +next +next +---- +000000:[a#9,SET-b#2,DEL] seqnums:[0-0] points:[a#9,SET-b#2,DEL] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000003:[g#9,SET-g#2,DEL] seqnums:[0-0] points:[g#9,SET-g#2,DEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +000005:[k#9,SET-k#2,DEL] seqnums:[0-0] points:[k#9,SET-k#2,DEL] +. + +iter key-type=ranges +first +next +next +next +---- +000001:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#9,RANGEKEYSET-d#inf,RANGEKEYSET] +000002:[e#9,SET-f#inf,RANGEKEYDEL] seqnums:[0-0] points:[e#9,SET-elderberry#2,DEL] ranges:[e#3,RANGEKEYSET-f#inf,RANGEKEYDEL] +000004:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] seqnums:[0-0] points:[incaberry#9,SET-incaettry#9,SET] ranges:[i#9,RANGEKEYSET-j#2,RANGEKEYSET] +. 
diff --git a/pebble/internal/manifest/testdata/overlaps b/pebble/internal/manifest/testdata/overlaps new file mode 100644 index 0000000..e584991 --- /dev/null +++ b/pebble/internal/manifest/testdata/overlaps @@ -0,0 +1,566 @@ +define +0: + 000700:[b#7008,SET-e#7009,SET] + 000701:[c#7018,SET-f#7019,SET] + 000702:[f#7028,SET-g#7029,SET] + 000703:[x#7038,SET-y#7039,SET] + 000704:[n#7048,SET-p#7049,SET] + 000705:[p#7058,SET-p#7059,SET] + 000706:[p#7068,SET-u#7069,SET] + 000707:[r#7078,SET-s#7079,SET] +1: + 000710:[a#7140,SET-d#inf,RANGEDEL] + 000711:[d#7108,SET-g#7109,SET] + 000712:[g#7118,SET-j#7119,SET] + 000713:[n#7128,SET-p#7129,SET] + 000714:[p#7148,SET-p#7149,SET] + 000715:[p#7138,SET-u#7139,SET] +---- +0.3: + 000704:[n#7048,SET-p#7049,SET] +0.2: + 000700:[b#7008,SET-e#7009,SET] + 000705:[p#7058,SET-p#7059,SET] +0.1: + 000701:[c#7018,SET-f#7019,SET] + 000706:[p#7068,SET-u#7069,SET] +0.0: + 000702:[f#7028,SET-g#7029,SET] + 000707:[r#7078,SET-s#7079,SET] + 000703:[x#7038,SET-y#7039,SET] +1: + 000710:[a#7140,SET-d#inf,RANGEDEL] + 000711:[d#7108,SET-g#7109,SET] + 000712:[g#7118,SET-j#7119,SET] + 000713:[n#7128,SET-p#7129,SET] + 000714:[p#7148,SET-p#7149,SET] + 000715:[p#7138,SET-u#7139,SET] + +# Level 0 + +overlaps level=0 start=a end=a exclusive-end=false +---- +0 files: + +overlaps level=0 start=a end=b exclusive-end=false +---- +3 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] + +overlaps level=0 start=a end=d exclusive-end=false +---- +3 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] + +overlaps level=0 start=a end=e exclusive-end=false +---- +3 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] + +overlaps level=0 start=a end=g exclusive-end=false +---- +3 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] + +overlaps level=0 start=a end=z 
exclusive-end=false +---- +8 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] +000703:[x#7038,SET-y#7039,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=c end=e exclusive-end=false +---- +3 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] + +overlaps level=0 start=d end=d exclusive-end=false +---- +3 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] + +# The below case relies on exclusive-end changing to false after picking some file. + +overlaps level=0 start=b end=f exclusive-end=true +---- +3 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] + +overlaps level=0 start=g end=n exclusive-end=false +---- +7 files: +000700:[b#7008,SET-e#7009,SET] +000701:[c#7018,SET-f#7019,SET] +000702:[f#7028,SET-g#7029,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=h end=i exclusive-end=false +---- +0 files: + +overlaps level=0 start=h end=o exclusive-end=false +---- +4 files: +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=h end=u exclusive-end=false +---- +4 files: +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=k end=l exclusive-end=false +---- +0 files: + +overlaps level=0 start=k end=o exclusive-end=false +---- +4 files: +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=k end=p exclusive-end=false +---- +4 files: 
+000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=n end=o exclusive-end=false +---- +4 files: +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=n end=z exclusive-end=false +---- +5 files: +000703:[x#7038,SET-y#7039,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=o end=z exclusive-end=false +---- +5 files: +000703:[x#7038,SET-y#7039,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=p end=z exclusive-end=false +---- +5 files: +000703:[x#7038,SET-y#7039,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=q end=z exclusive-end=false +---- +5 files: +000703:[x#7038,SET-y#7039,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=r end=s exclusive-end=false +---- +4 files: +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=r end=z exclusive-end=false +---- +5 files: +000703:[x#7038,SET-y#7039,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=s end=z exclusive-end=false +---- +5 files: +000703:[x#7038,SET-y#7039,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=u end=z exclusive-end=false +---- +5 files: 
+000703:[x#7038,SET-y#7039,SET] +000704:[n#7048,SET-p#7049,SET] +000705:[p#7058,SET-p#7059,SET] +000706:[p#7068,SET-u#7069,SET] +000707:[r#7078,SET-s#7079,SET] + +overlaps level=0 start=y end=z exclusive-end=false +---- +1 files: +000703:[x#7038,SET-y#7039,SET] + +overlaps level=0 start=z end=z exclusive-end=false +---- +0 files: + +# Level 1 + +overlaps level=1 start=a end=a exclusive-end=false +---- +1 files: +000710:[a#7140,SET-d#inf,RANGEDEL] + +overlaps level=1 start=a end=b exclusive-end=false +---- +1 files: +000710:[a#7140,SET-d#inf,RANGEDEL] + +overlaps level=1 start=a end=d exclusive-end=false +---- +2 files: +000710:[a#7140,SET-d#inf,RANGEDEL] +000711:[d#7108,SET-g#7109,SET] + +overlaps level=1 start=a end=e exclusive-end=false +---- +2 files: +000710:[a#7140,SET-d#inf,RANGEDEL] +000711:[d#7108,SET-g#7109,SET] + +overlaps level=1 start=a end=g exclusive-end=false +---- +3 files: +000710:[a#7140,SET-d#inf,RANGEDEL] +000711:[d#7108,SET-g#7109,SET] +000712:[g#7118,SET-j#7119,SET] + +overlaps level=1 start=a end=g exclusive-end=true +---- +2 files: +000710:[a#7140,SET-d#inf,RANGEDEL] +000711:[d#7108,SET-g#7109,SET] + +overlaps level=1 start=a end=z exclusive-end=false +---- +6 files: +000710:[a#7140,SET-d#inf,RANGEDEL] +000711:[d#7108,SET-g#7109,SET] +000712:[g#7118,SET-j#7119,SET] +000713:[n#7128,SET-p#7129,SET] +000714:[p#7148,SET-p#7149,SET] +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=a end=z exclusive-end=true +---- +6 files: +000710:[a#7140,SET-d#inf,RANGEDEL] +000711:[d#7108,SET-g#7109,SET] +000712:[g#7118,SET-j#7119,SET] +000713:[n#7128,SET-p#7129,SET] +000714:[p#7148,SET-p#7149,SET] +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=c end=e exclusive-end=false +---- +2 files: +000710:[a#7140,SET-d#inf,RANGEDEL] +000711:[d#7108,SET-g#7109,SET] + +overlaps level=1 start=d end=d exclusive-end=false +---- +1 files: +000711:[d#7108,SET-g#7109,SET] + +overlaps level=1 start=g end=n exclusive-end=false +---- +3 files: 
+000711:[d#7108,SET-g#7109,SET] +000712:[g#7118,SET-j#7119,SET] +000713:[n#7128,SET-p#7129,SET] + +overlaps level=1 start=h end=i exclusive-end=false +---- +1 files: +000712:[g#7118,SET-j#7119,SET] + +overlaps level=1 start=h end=n exclusive-end=true +---- +1 files: +000712:[g#7118,SET-j#7119,SET] + +overlaps level=1 start=h end=n exclusive-end=false +---- +2 files: +000712:[g#7118,SET-j#7119,SET] +000713:[n#7128,SET-p#7129,SET] + +overlaps level=1 start=h end=o exclusive-end=false +---- +2 files: +000712:[g#7118,SET-j#7119,SET] +000713:[n#7128,SET-p#7129,SET] + +overlaps level=1 start=h end=u exclusive-end=false +---- +4 files: +000712:[g#7118,SET-j#7119,SET] +000713:[n#7128,SET-p#7129,SET] +000714:[p#7148,SET-p#7149,SET] +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=k end=l exclusive-end=false +---- +0 files: + +overlaps level=1 start=k end=o exclusive-end=false +---- +1 files: +000713:[n#7128,SET-p#7129,SET] + +overlaps level=1 start=k end=p exclusive-end=false +---- +3 files: +000713:[n#7128,SET-p#7129,SET] +000714:[p#7148,SET-p#7149,SET] +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=k end=p exclusive-end=true +---- +1 files: +000713:[n#7128,SET-p#7129,SET] + +overlaps level=1 start=n end=o exclusive-end=false +---- +1 files: +000713:[n#7128,SET-p#7129,SET] + +overlaps level=1 start=n end=z exclusive-end=false +---- +3 files: +000713:[n#7128,SET-p#7129,SET] +000714:[p#7148,SET-p#7149,SET] +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=o end=z exclusive-end=false +---- +3 files: +000713:[n#7128,SET-p#7129,SET] +000714:[p#7148,SET-p#7149,SET] +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=p end=z exclusive-end=false +---- +3 files: +000713:[n#7128,SET-p#7129,SET] +000714:[p#7148,SET-p#7149,SET] +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=q end=z exclusive-end=false +---- +1 files: +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=r end=s exclusive-end=false +---- +1 files: 
+000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=r end=z exclusive-end=false +---- +1 files: +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=s end=z exclusive-end=false +---- +1 files: +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=u end=z exclusive-end=false +---- +1 files: +000715:[p#7138,SET-u#7139,SET] + +overlaps level=1 start=y end=z exclusive-end=false +---- +0 files: + +overlaps level=1 start=z end=z exclusive-end=false +---- +0 files: + +# Level 2 is empty. + +overlaps level=2 start=a end=z exclusive-end=false +---- +0 files: + +# Test a scenario where an originally exclusive-end must be promoted to +# inclusive during the iterative expansion of L0 overlaps. +# +# 000003 with the f largest bound must be included. + +define +0: + 000001:[a#1,SET-d#2,SET] + 000002:[c#3,SET-f#4,SET] + 000003:[f#5,SET-f#5,SET] +---- +0.2: + 000001:[a#1,SET-d#2,SET] +0.1: + 000002:[c#3,SET-f#4,SET] +0.0: + 000003:[f#5,SET-f#5,SET] + +overlaps level=0 start=a end=b exclusive-end=true +---- +3 files: +000001:[a#1,SET-d#2,SET] +000002:[c#3,SET-f#4,SET] +000003:[f#5,SET-f#5,SET] + +# The below is a verbatim reproduction of the case detected by the +# metamorphic tests in pebble#1459: The above case is already a +# simplified version of the same condition. The verbatim reproduction is +# included for completeness. 
+ +define +0.4: + 000987:[aiinjp@20#4667,SET-fcklu@5#inf,RANGEDEL] + 000988:[fcklu@5#4668,MERGE-glpw@1#inf,RANGEDEL] + 000989:[glpw@1#4662,RANGEDEL-mlgxnog@19#inf,RANGEDEL] + 000990:[mlgxnog@19#4662,RANGEDEL-nwnmqtyvjt@5#inf,RANGEDEL] + 000991:[nwnmqtyvjt@5#4662,RANGEDEL-wmkrrxp@6#inf,RANGEDEL] +0.3: + 000978:[dygfdczcax@15#4609,DEL-vtocgpw@18#4609,DEL] + 000992:[wmkrrxp@6#4657,MERGE-yyquzcd@21#4624,SET] + 000993:[zslykqao@12#4636,SINGLEDEL-zzqwavxgrec@12#4627,DEL] +0.2: + 000981:[fhcykuix@5#4601,MERGE-kiati@10#4595,MERGE] + 000977:[mgksrvk@15#4598,DEL-mgksrvk@15#4598,DEL] + 000982:[nirnrarzktp@12#4600,MERGE-zaowx@3#4602,SET] + 000828:[zzqwavxgrec@12#4092,SINGLEDEL-zzqwavxgrec@12#4092,SINGLEDEL] +0.1: + 000980:[dusu@10#4603,SET-duyeldgvnll@21#4605,SET] + 000973:[ewqqtp@15#4591,RANGEDEL-zaygjmy@1#inf,RANGEDEL] + 000605:[zzqwavxgrec@12#2894,SET-zzqwavxgrec@12#2894,SET] +0.0: + 000910:[abddymplk@20#4370,MERGE-abddymplk@20#4370,MERGE] + 000939:[abvukibeofb@13#4439,SET-abvukibeofb@13#4439,SET] + 000975:[ajoqjxr@16#4578,MERGE-zjyqka@1#4544,DEL] + 000983:[znnoar@20#4604,SINGLEDEL-znnoar@20#4604,SINGLEDEL] + 000535:[zzqwavxgrec@12#2657,SINGLEDEL-zzqwavxgrec@12#2526,SET] +5: + 000971:[acutc@6#4227,SET-zzhra@12#inf,RANGEDEL] +6: + 000806:[gourk@18#0,SET-zzhra@2#0,SET] +---- +0.4: + 000987:[aiinjp@20#4667,SET-fcklu@5#inf,RANGEDEL] + 000988:[fcklu@5#4668,MERGE-glpw@1#inf,RANGEDEL] + 000989:[glpw@1#4662,RANGEDEL-mlgxnog@19#inf,RANGEDEL] + 000990:[mlgxnog@19#4662,RANGEDEL-nwnmqtyvjt@5#inf,RANGEDEL] + 000991:[nwnmqtyvjt@5#4662,RANGEDEL-wmkrrxp@6#inf,RANGEDEL] +0.3: + 000978:[dygfdczcax@15#4609,DEL-vtocgpw@18#4609,DEL] + 000992:[wmkrrxp@6#4657,MERGE-yyquzcd@21#4624,SET] + 000993:[zslykqao@12#4636,SINGLEDEL-zzqwavxgrec@12#4627,DEL] +0.2: + 000981:[fhcykuix@5#4601,MERGE-kiati@10#4595,MERGE] + 000977:[mgksrvk@15#4598,DEL-mgksrvk@15#4598,DEL] + 000982:[nirnrarzktp@12#4600,MERGE-zaowx@3#4602,SET] + 000828:[zzqwavxgrec@12#4092,SINGLEDEL-zzqwavxgrec@12#4092,SINGLEDEL] +0.1: + 
000980:[dusu@10#4603,SET-duyeldgvnll@21#4605,SET] + 000973:[ewqqtp@15#4591,RANGEDEL-zaygjmy@1#inf,RANGEDEL] + 000605:[zzqwavxgrec@12#2894,SET-zzqwavxgrec@12#2894,SET] +0.0: + 000910:[abddymplk@20#4370,MERGE-abddymplk@20#4370,MERGE] + 000939:[abvukibeofb@13#4439,SET-abvukibeofb@13#4439,SET] + 000975:[ajoqjxr@16#4578,MERGE-zjyqka@1#4544,DEL] + 000983:[znnoar@20#4604,SINGLEDEL-znnoar@20#4604,SINGLEDEL] + 000535:[zzqwavxgrec@12#2657,SINGLEDEL-zzqwavxgrec@12#2526,SET] +5: + 000971:[acutc@6#4227,SET-zzhra@12#inf,RANGEDEL] +6: + 000806:[gourk@18#0,SET-zzhra@2#0,SET] + +overlaps level=0 start=heacptnep@12 end=kiicbzwtpe@16 exclusive-end=false +---- +13 files: +000973:[ewqqtp@15#4591,RANGEDEL-zaygjmy@1#inf,RANGEDEL] +000975:[ajoqjxr@16#4578,MERGE-zjyqka@1#4544,DEL] +000977:[mgksrvk@15#4598,DEL-mgksrvk@15#4598,DEL] +000978:[dygfdczcax@15#4609,DEL-vtocgpw@18#4609,DEL] +000980:[dusu@10#4603,SET-duyeldgvnll@21#4605,SET] +000981:[fhcykuix@5#4601,MERGE-kiati@10#4595,MERGE] +000982:[nirnrarzktp@12#4600,MERGE-zaowx@3#4602,SET] +000987:[aiinjp@20#4667,SET-fcklu@5#inf,RANGEDEL] +000988:[fcklu@5#4668,MERGE-glpw@1#inf,RANGEDEL] +000989:[glpw@1#4662,RANGEDEL-mlgxnog@19#inf,RANGEDEL] +000990:[mlgxnog@19#4662,RANGEDEL-nwnmqtyvjt@5#inf,RANGEDEL] +000991:[nwnmqtyvjt@5#4662,RANGEDEL-wmkrrxp@6#inf,RANGEDEL] +000992:[wmkrrxp@6#4657,MERGE-yyquzcd@21#4624,SET] + +overlaps level=0 start=acutc@6 end=zzhra@12 exclusive-end=true +---- +18 files: +000535:[zzqwavxgrec@12#2657,SINGLEDEL-zzqwavxgrec@12#2526,SET] +000605:[zzqwavxgrec@12#2894,SET-zzqwavxgrec@12#2894,SET] +000828:[zzqwavxgrec@12#4092,SINGLEDEL-zzqwavxgrec@12#4092,SINGLEDEL] +000973:[ewqqtp@15#4591,RANGEDEL-zaygjmy@1#inf,RANGEDEL] +000975:[ajoqjxr@16#4578,MERGE-zjyqka@1#4544,DEL] +000977:[mgksrvk@15#4598,DEL-mgksrvk@15#4598,DEL] +000978:[dygfdczcax@15#4609,DEL-vtocgpw@18#4609,DEL] +000980:[dusu@10#4603,SET-duyeldgvnll@21#4605,SET] +000981:[fhcykuix@5#4601,MERGE-kiati@10#4595,MERGE] +000982:[nirnrarzktp@12#4600,MERGE-zaowx@3#4602,SET] 
+000983:[znnoar@20#4604,SINGLEDEL-znnoar@20#4604,SINGLEDEL] +000987:[aiinjp@20#4667,SET-fcklu@5#inf,RANGEDEL] +000988:[fcklu@5#4668,MERGE-glpw@1#inf,RANGEDEL] +000989:[glpw@1#4662,RANGEDEL-mlgxnog@19#inf,RANGEDEL] +000990:[mlgxnog@19#4662,RANGEDEL-nwnmqtyvjt@5#inf,RANGEDEL] +000991:[nwnmqtyvjt@5#4662,RANGEDEL-wmkrrxp@6#inf,RANGEDEL] +000992:[wmkrrxp@6#4657,MERGE-yyquzcd@21#4624,SET] +000993:[zslykqao@12#4636,SINGLEDEL-zzqwavxgrec@12#4627,DEL] diff --git a/pebble/internal/manifest/testdata/version_check_ordering b/pebble/internal/manifest/testdata/version_check_ordering new file mode 100644 index 0000000..9f8f710 --- /dev/null +++ b/pebble/internal/manifest/testdata/version_check_ordering @@ -0,0 +1,302 @@ +# Note: when specifying test cases with tables in L0, the L0 files should be +# specified in seqnum descending order, as the test case input is parsed as the +# inverse of `(*FileMetadata).DebugString`. + +check-ordering +0: + 000001:[a#1,SET-b#2,SET] +---- +OK + +check-ordering +0: + 000002:[c#3,SET-d#4,SET] + 000001:[a#1,SET-b#2,SET] +---- +OK + +check-ordering +0: + 000002:[a#1,SET-b#2,SET] + 000001:[c#3,SET-d#4,SET] +---- +L0 files 000001 and 000002 are not properly ordered: <#3-#4> vs <#1-#2> +0.0: + 000002:[a#1,SET-b#2,SET] seqnums:[1-2] points:[a#1,SET-b#2,SET] + 000001:[c#3,SET-d#4,SET] seqnums:[3-4] points:[c#3,SET-d#4,SET] + +check-ordering +0: + 000008:[k#16,SET-n#19,SET] + 000007:[a#14,SET-j#17,SET] + 000006:[b#15,SET-d#15,SET] + 000005:[i#8,SET-j#13,SET] + 000004:[g#6,SET-h#12,SET] + 000003:[e#2,SET-f#7,SET] + 000002:[a#1,SET-b#5,SET] + 000001:[c#3,SET-d#4,SET] +---- +OK + +# Add some ingested SSTables around the 14-19 seqnum cases. 
+check-ordering +0: + 000010:[m#20,SET-n#20,SET] + 000009:[k#16,SET-n#19,SET] + 000008:[m#18,SET-n#18,SET] + 000007:[a#14,SET-j#17,SET] + 000006:[b#15,SET-d#15,SET] + 000005:[i#8,SET-j#13,SET] + 000004:[g#6,SET-h#12,SET] + 000003:[e#2,SET-f#7,SET] + 000002:[a#1,SET-b#5,SET] + 000001:[c#3,SET-d#4,SET] +---- +OK + +# Coincident sequence numbers around sstables with overlapping sequence numbers +# are possible due to flush splitting, so this is acceptable. +check-ordering +0: + 000010:[m#20,SET-n#20,SET] + 000009:[k#16,SET-n#19,SET] + 000008:[m#18,SET-n#18,SET] + 000007:[a#15,SET-j#17,SET] + 000006:[b#15,SET-d#15,SET] + 000005:[i#8,SET-j#13,SET] + 000004:[g#6,SET-h#12,SET] + 000003:[e#2,SET-f#7,SET] + 000002:[a#1,SET-b#5,SET] + 000001:[c#3,SET-d#4,SET] +---- +OK + +# Ensure that sstables passed in a non-sorted order are detected. +check-ordering +0: + 000002:[a#1,SET-b#2,SET] + 000001:[a#3,SET-d#3,SET] +---- +L0 files 000001 and 000002 are not properly ordered: <#3-#3> vs <#1-#2> +0.1: + 000002:[a#1,SET-b#2,SET] seqnums:[1-2] points:[a#1,SET-b#2,SET] +0.0: + 000001:[a#3,SET-d#3,SET] seqnums:[3-3] points:[a#3,SET-d#3,SET] + +check-ordering +0: + 000002:[a#3,SET-b#3,SET] + 000001:[a#2,SET-d#4,SET] +---- +L0 files 000001 and 000002 are not properly ordered: <#2-#4> vs <#3-#3> +0.1: + 000002:[a#3,SET-b#3,SET] seqnums:[3-3] points:[a#3,SET-b#3,SET] +0.0: + 000001:[a#2,SET-d#4,SET] seqnums:[2-4] points:[a#2,SET-d#4,SET] + +check-ordering +0: + 000002:[a#3,SET-b#3,SET] + 000001:[a#3,SET-d#3,SET] +---- +OK + +check-ordering +0: + 000002:[a#3,SET-d#5,SET] + 000001:[a#3,SET-d#3,SET] +---- +OK + +check-ordering +0: + 000002:[a#3,SET-d#5,SET] + 000001:[a#4,SET-d#4,SET] +---- +OK + +check-ordering +0: + 000002:[a#5,SET-d#5,SET] + 000001:[a#3,SET-d#5,SET] +---- +OK + +check-ordering +0: + 000003:[a#4,SET-d#6,SET] + 000002:[a#5,SET-d#5,SET] + 000001:[a#4,SET-d#4,SET] +---- +OK + +check-ordering +0: + 000003:[a#0,SET-d#3,SET] + 000002:[a#0,SET-d#0,SET] + 000001:[a#0,SET-d#0,SET] 
+---- +OK + +check-ordering +1: + 000001:[a#1,SET-b#2,SET] +---- +OK + +check-ordering +1: + 000001:[b#1,SET-a#2,SET] +---- +L1 : file 000001 has inconsistent bounds: b#1,SET vs a#2,SET +1: + 000001:[b#1,SET-a#2,SET] seqnums:[0-0] points:[b#1,SET-a#2,SET] + +check-ordering +1: + 000001:[a#1,SET-b#2,SET] + 000002:[c#3,SET-d#4,SET] +---- +OK + +check-ordering +1: + 000001:[a#1,SET-b#2,SET] + 000002:[d#3,SET-c#4,SET] +---- +L1 : file 000002 has inconsistent bounds: d#3,SET vs c#4,SET +1: + 000001:[a#1,SET-b#2,SET] seqnums:[0-0] points:[a#1,SET-b#2,SET] + 000002:[d#3,SET-c#4,SET] seqnums:[0-0] points:[d#3,SET-c#4,SET] + +check-ordering +1: + 000001:[a#1,SET-b#2,SET] + 000002:[b#1,SET-d#4,SET] +---- +L1 files 000001 and 000002 have overlapping ranges: [a#1,SET-b#2,SET] vs [b#1,SET-d#4,SET] +1: + 000001:[a#1,SET-b#2,SET] seqnums:[0-0] points:[a#1,SET-b#2,SET] + 000002:[b#1,SET-d#4,SET] seqnums:[0-0] points:[b#1,SET-d#4,SET] + +check-ordering allow-split-user-keys +1: + 000001:[a#1,SET-b#2,SET] + 000002:[b#1,SET-d#4,SET] +---- +OK + +check-ordering +1: + 000001:[a#1,SET-b#2,SET] + 000002:[b#2,SET-d#4,SET] +---- +L1 files 000001 and 000002 have overlapping ranges: [a#1,SET-b#2,SET] vs [b#2,SET-d#4,SET] +1: + 000001:[a#1,SET-b#2,SET] seqnums:[0-0] points:[a#1,SET-b#2,SET] + 000002:[b#2,SET-d#4,SET] seqnums:[0-0] points:[b#2,SET-d#4,SET] + +check-ordering +1: + 000001:[a#1,SET-c#2,SET] + 000002:[b#3,SET-d#4,SET] +---- +L1 files 000001 and 000002 have overlapping ranges: [a#1,SET-c#2,SET] vs [b#3,SET-d#4,SET] +1: + 000001:[a#1,SET-c#2,SET] seqnums:[0-0] points:[a#1,SET-c#2,SET] + 000002:[b#3,SET-d#4,SET] seqnums:[0-0] points:[b#3,SET-d#4,SET] + +check-ordering +1: + 000001:[a#1,SET-c#2,SET] +2: + 000002:[b#3,SET-d#4,SET] +---- +OK + +check-ordering +1: + 000001:[a#1,SET-c#2,SET] +2: + 000002:[b#3,SET-d#4,SET] + 000003:[c#5,SET-e#6,SET] +---- +L2 files 000002 and 000003 have overlapping ranges: [b#3,SET-d#4,SET] vs [c#5,SET-e#6,SET] +1: + 000001:[a#1,SET-c#2,SET] seqnums:[0-0] 
points:[a#1,SET-c#2,SET] +2: + 000002:[b#3,SET-d#4,SET] seqnums:[0-0] points:[b#3,SET-d#4,SET] + 000003:[c#5,SET-e#6,SET] seqnums:[0-0] points:[c#5,SET-e#6,SET] + +# Ordering considers tables with just range keys. + +check-ordering +0: + 000002:[c#3,RANGEKEYSET-d#inf,RANGEKEYSET] ranges:[c#3,RANGEKEYSET-d#inf,RANGEKEYSET] + 000001:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] +---- +OK + +check-ordering +0: + 000002:[c#1,RANGEKEYSET-d#inf,RANGEKEYSET] ranges:[c#1,RANGEKEYSET-d#inf,RANGEKEYSET] + 000001:[a#3,RANGEKEYSET-b#inf,RANGEKEYSET] ranges:[a#3,RANGEKEYSET-b#inf,RANGEKEYSET] +---- +L0 files 000001 and 000002 are not properly ordered: <#3-#72057594037927935> vs <#1-#72057594037927935> +0.0: + 000001:[a#3,RANGEKEYSET-b#inf,RANGEKEYSET] seqnums:[3-72057594037927935] ranges:[a#3,RANGEKEYSET-b#inf,RANGEKEYSET] + 000002:[c#1,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[1-72057594037927935] ranges:[c#1,RANGEKEYSET-d#inf,RANGEKEYSET] + +check-ordering +1: + 000001:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] + 000002:[c#3,RANGEKEYSET-d#inf,RANGEKEYSET] ranges:[c#3,RANGEKEYSET-d#inf,RANGEKEYSET] +---- +OK + +check-ordering +1: + 000001:[c#3,RANGEKEYSET-d#inf,RANGEKEYSET] ranges:[c#3,RANGEKEYSET-d#inf,RANGEKEYSET] + 000002:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] +---- +L1 files 000001 and 000002 are not properly ordered: [c#3,RANGEKEYSET-d#inf,RANGEKEYSET] vs [a#1,RANGEKEYSET-b#inf,RANGEKEYSET] +1: + 000001:[c#3,RANGEKEYSET-d#inf,RANGEKEYSET] seqnums:[0-0] ranges:[c#3,RANGEKEYSET-d#inf,RANGEKEYSET] + 000002:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] seqnums:[0-0] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] + +# Ordering considers tables with both point and range keys. 
+ +check-ordering +0: + 000002:[c#1,RANGEKEYSET-e#4,SET] points:[d#3,SET-e#4,SET] ranges:[c#1,RANGEKEYSET-d#inf,RANGEKEYSET] + 000001:[a#1,RANGEKEYSET-c#2,SET] points:[b#1,SET-c#2,SET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] +---- +OK + +check-ordering +0: + 000002:[c#1,RANGEKEYSET-e#2,SET] points:[d#3,SET-e#2,SET] ranges:[c#1,RANGEKEYSET-d#inf,RANGEKEYSET] + 000001:[a#1,RANGEKEYSET-c#4,SET] points:[b#1,SET-c#4,SET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] +---- +L0 files 000001 and 000002 are not properly ordered: <#1-#4> vs <#1-#2> +0.1: + 000002:[c#1,RANGEKEYSET-e#2,SET] seqnums:[1-2] points:[d#3,SET-e#2,SET] ranges:[c#1,RANGEKEYSET-d#inf,RANGEKEYSET] +0.0: + 000001:[a#1,RANGEKEYSET-c#4,SET] seqnums:[1-4] points:[b#1,SET-c#4,SET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] + +check-ordering +1: + 000001:[a#1,RANGEKEYSET-c#2,SET] points:[b#1,SET-c#2,SET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] + 000002:[d#3,RANGEKEYSET-f#4,SET] points:[e#3,SET-f#4,SET] ranges:[d#3,RANGEKEYSET-e#inf,RANGEKEYSET] +---- +OK + +check-ordering +1: + 000001:[a#1,RANGEKEYSET-c#2,SET] points:[b#1,SET-c#2,SET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] + 000002:[c#3,RANGEKEYSET-f#4,SET] points:[e#3,SET-f#4,SET] ranges:[c#3,RANGEKEYSET-e#inf,RANGEKEYSET] +---- +L1 files 000001 and 000002 have overlapping ranges: [a#1,RANGEKEYSET-c#2,SET] vs [c#3,RANGEKEYSET-f#4,SET] +1: + 000001:[a#1,RANGEKEYSET-c#2,SET] seqnums:[0-0] points:[b#1,SET-c#2,SET] ranges:[a#1,RANGEKEYSET-b#inf,RANGEKEYSET] + 000002:[c#3,RANGEKEYSET-f#4,SET] seqnums:[0-0] points:[e#3,SET-f#4,SET] ranges:[c#3,RANGEKEYSET-e#inf,RANGEKEYSET] diff --git a/pebble/internal/manifest/testdata/version_edit_apply b/pebble/internal/manifest/testdata/version_edit_apply new file mode 100644 index 0000000..94df012 --- /dev/null +++ b/pebble/internal/manifest/testdata/version_edit_apply @@ -0,0 +1,191 @@ +apply + L0 + 1:[a#1,SET-b#2,SET] + 2:[c#3,SET-d#4,SET] +edit + delete + L0 + 1 + add + L2 + 1:[a#1,SET-b#2,SET] + 4:[c#3,SET-d#4,SET] 
+---- +0.0: + 000002:[c#3,SET-d#4,SET] +2: + 000001:[a#1,SET-b#2,SET] + 000004:[c#3,SET-d#4,SET] +zombies [] + +apply + L0 + 1:[a#1,SET-b#2,SET] + 2:[c#3,SET-d#4,SET] +edit + delete + L1 + 1 +---- +pebble: internal error: No current or added files but have deleted files: 1 + +apply + L0 + 1:[a#1,SET-c#2,SET] + 2:[c#3,SET-d#4,SET] +edit + delete + L0 + 1 + add + L2 + 1:[a#1,SET-c#2,SET] + 4:[b#3,SET-d#4,SET] +---- +pebble: internal error: L2 files 000001 and 000004 have overlapping ranges: [a#1,SET-c#2,SET] vs [b#3,SET-d#4,SET] + +apply + L0 + 1:[a#1,SET-c#2,SET] + 2:[c#3,SET-d#4,SET] +edit + add + L0 + 4:[b#3,SET-d#5,SET] +---- +0.2: + 000004:[b#3,SET-d#5,SET] +0.1: + 000002:[c#3,SET-d#4,SET] +0.0: + 000001:[a#1,SET-c#2,SET] +zombies [] + +apply + L0 + 1:[a#1,SET-c#2,SET] + 2:[c#3,SET-d#4,SET] +edit + add + L0 + 4:[b#0,SET-d#0,SET] +---- +0.2: + 000002:[c#3,SET-d#4,SET] +0.1: + 000001:[a#1,SET-c#2,SET] +0.0: + 000004:[b#0,SET-d#0,SET] +zombies [] + + +apply +edit + add + L0 + 1:[a#1,SET-c#2,SET] + 4:[b#3,SET-d#5,SET] +---- +0.1: + 000004:[b#3,SET-d#5,SET] +0.0: + 000001:[a#1,SET-c#2,SET] +zombies [] + +apply + L0 + 1:[a#1,SET-c#2,SET] +---- +0.0: + 000001:[a#1,SET-c#2,SET] +zombies [] + +apply + L2 + 3:[b#1,SET-c#2,SET] + 4:[d#3,SET-f#4,SET] + 5:[h#3,SET-h#2,SET] + 2:[n#5,SET-q#3,SET] + 1:[r#2,SET-t#1,SET] +edit + delete + L2 + 4 + 1 + add + L2 + 6:[a#10,SET-a#7,SET] + 7:[e#1,SET-g#2,SET] + 10:[j#3,SET-m#2,SET] +---- +2: + 000006:[a#10,SET-a#7,SET] + 000003:[b#1,SET-c#2,SET] + 000007:[e#1,SET-g#2,SET] + 000005:[h#3,SET-h#2,SET] + 000010:[j#3,SET-m#2,SET] + 000002:[n#5,SET-q#3,SET] +zombies [1 4] + +apply +edit + add + L2 + 10:[j#3,SET-m#2,SET] + 6:[a#10,SET-a#7,SET] +---- +2: + 000006:[a#10,SET-a#7,SET] + 000010:[j#3,SET-m#2,SET] +zombies [] + +# Verify that the zombies map is populated correctly. 
+ +apply + L0 + 1:[a#1,SET-b#2,SET] + L1 + 2:[c#3,SET-d#2,SET] +edit + delete + L0 + 1 + L1 + 2 +---- +zombies [1 2] + +# Deletion of a non-existent table results in an error. + +apply + L0 + 1:[a#1,SET-b#2,SET] +edit + delete + L0 + 2 +---- +pebble: file deleted L0.000002 before it was inserted + +apply + L0 + 1:[a#1,SET-b#2,SET] +edit + delete + L0 + 1 + add + L2 + 1:[a#1,SET-b#2,SET] + 4:[c#3,SET-d#4,SET] + 5:[s#3,SET-z#4,SET] +edit + delete + L2 + 1 + L2 + 4 +---- +2: + 000005:[s#3,SET-z#4,SET] +zombies [] diff --git a/pebble/internal/manifest/version.go b/pebble/internal/manifest/version.go new file mode 100644 index 0000000..549aa22 --- /dev/null +++ b/pebble/internal/manifest/version.go @@ -0,0 +1,1561 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + "fmt" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "unicode" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" + stdcmp "github.com/cockroachdb/pebble/shims/cmp" +) + +// Compare exports the base.Compare type. +type Compare = base.Compare + +// InternalKey exports the base.InternalKey type. +type InternalKey = base.InternalKey + +// TableInfo contains the common information for table related events. +type TableInfo struct { + // FileNum is the internal DB identifier for the table. + FileNum base.FileNum + // Size is the size of the file in bytes. + Size uint64 + // Smallest is the smallest internal key in the table. + Smallest InternalKey + // Largest is the largest internal key in the table. + Largest InternalKey + // SmallestSeqNum is the smallest sequence number in the table. + SmallestSeqNum uint64 + // LargestSeqNum is the largest sequence number in the table. 
+ LargestSeqNum uint64 +} + +// TableStats contains statistics on a table used for compaction heuristics, +// and export via Metrics. +type TableStats struct { + // The total number of entries in the table. + NumEntries uint64 + // The number of point and range deletion entries in the table. + NumDeletions uint64 + // NumRangeKeySets is the total number of range key sets in the table. + // + // NB: If there's a chance that the sstable contains any range key sets, + // then NumRangeKeySets must be > 0. + NumRangeKeySets uint64 + // Estimate of the total disk space that may be dropped by this table's + // point deletions by compacting them. + PointDeletionsBytesEstimate uint64 + // Estimate of the total disk space that may be dropped by this table's + // range deletions by compacting them. This estimate is at data-block + // granularity and is not updated if compactions beneath the table reduce + // the amount of reclaimable disk space. It also does not account for + // overlapping data in L0 and ignores L0 sublevels, but the error that + // introduces is expected to be small. + // + // Tables in the bottommost level of the LSM may have a nonzero estimate if + // snapshots or move compactions prevented the elision of their range + // tombstones. A table in the bottommost level that was ingested into L6 + // will have a zero estimate, because the file's sequence numbers indicate + // that the tombstone cannot drop any data contained within the file itself. + RangeDeletionsBytesEstimate uint64 + // Total size of value blocks and value index block. + ValueBlocksSize uint64 +} + +// boundType represents the type of key (point or range) present as the smallest +// and largest keys. +type boundType uint8 + +const ( + boundTypePointKey boundType = iota + 1 + boundTypeRangeKey +) + +// CompactionState is the compaction state of a file. 
+// +// The following shows the valid state transitions: +// +// NotCompacting --> Compacting --> Compacted +// ^ | +// | | +// +-------<-------+ +// +// Input files to a compaction transition to Compacting when a compaction is +// picked. A file that has finished compacting typically transitions into the +// Compacted state, at which point it is effectively obsolete ("zombied") and +// will eventually be removed from the LSM. A file that has been move-compacted +// will transition from Compacting back into the NotCompacting state, signaling +// that the file may be selected for a subsequent compaction. A failed +// compaction will result in all input tables transitioning from Compacting to +// NotCompacting. +// +// This state is in-memory only. It is not persisted to the manifest. +type CompactionState uint8 + +// CompactionStates. +const ( + CompactionStateNotCompacting CompactionState = iota + CompactionStateCompacting + CompactionStateCompacted +) + +// String implements fmt.Stringer. +func (s CompactionState) String() string { + switch s { + case CompactionStateNotCompacting: + return "NotCompacting" + case CompactionStateCompacting: + return "Compacting" + case CompactionStateCompacted: + return "Compacted" + default: + panic(fmt.Sprintf("pebble: unknown compaction state %d", s)) + } +} + +// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of +// some version. FileMetadata does not contain the actual level of the sst, +// since such leveled-ssts can move across levels in different versions, while +// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical +// and virtual. Underlying both leveled-ssts is a backing-sst, for which the +// only state is FileBacking. A backing-sst is level-less. It is possible for a +// backing-sst to be referred to by a physical sst in one version and by one or +// more virtual ssts in one or more versions. 
A backing-sst becomes obsolete +// and can be deleted once it is no longer required by any physical or virtual +// sst in any version. +// +// We maintain some invariants: +// +// 1. Each physical and virtual sst will have a unique FileMetadata.FileNum, +// and there will be exactly one FileMetadata associated with the FileNum. +// +// 2. Within a version, a backing-sst is either only referred to by one +// physical sst or one or more virtual ssts. +// +// 3. Once a backing-sst is referred to by a virtual sst in the latest version, +// it cannot go back to being referred to by a physical sst in any future +// version. +// +// Once a physical sst is no longer needed by any version, we will no longer +// maintain the file metadata associated with it. We will still maintain the +// FileBacking associated with the physical sst if the backing sst is required +// by any virtual ssts in any version. +type FileMetadata struct { + // AllowedSeeks is used to determine if a file should be picked for + // a read triggered compaction. It is decremented when read sampling + // in pebble.Iterator after every after every positioning operation + // that returns a user key (eg. Next, Prev, SeekGE, SeekLT, etc). + AllowedSeeks atomic.Int64 + + // statsValid indicates if stats have been loaded for the table. The + // TableStats structure is populated only if valid is true. + statsValid atomic.Bool + + // FileBacking is the state which backs either a physical or virtual + // sstables. + FileBacking *FileBacking + + // InitAllowedSeeks is the inital value of allowed seeks. This is used + // to re-set allowed seeks on a file once it hits 0. + InitAllowedSeeks int64 + // FileNum is the file number. + // + // INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum. + FileNum base.FileNum + // Size is the size of the file, in bytes. Size is an approximate value for + // virtual sstables. + // + // INVARIANTS: + // - When !FileMetadata.Virtual, Size == FileBacking.Size. 
+ // - Size should be non-zero. Size 0 virtual sstables must not be created. + Size uint64 + // File creation time in seconds since the epoch (1970-01-01 00:00:00 + // UTC). For ingested sstables, this corresponds to the time the file was + // ingested. For virtual sstables, this corresponds to the wall clock time + // when the FileMetadata for the virtual sstable was first created. + CreationTime int64 + // Lower and upper bounds for the smallest and largest sequence numbers in + // the table, across both point and range keys. For physical sstables, these + // values are tight bounds. For virtual sstables, there is no guarantee that + // there will be keys with SmallestSeqNum or LargestSeqNum within virtual + // sstable bounds. + SmallestSeqNum uint64 + LargestSeqNum uint64 + // SmallestPointKey and LargestPointKey are the inclusive bounds for the + // internal point keys stored in the table. This includes RANGEDELs, which + // alter point keys. + // NB: these field should be set using ExtendPointKeyBounds. They are left + // exported for reads as an optimization. + SmallestPointKey InternalKey + LargestPointKey InternalKey + // SmallestRangeKey and LargestRangeKey are the inclusive bounds for the + // internal range keys stored in the table. + // NB: these field should be set using ExtendRangeKeyBounds. They are left + // exported for reads as an optimization. + SmallestRangeKey InternalKey + LargestRangeKey InternalKey + // Smallest and Largest are the inclusive bounds for the internal keys stored + // in the table, across both point and range keys. + // NB: these fields are derived from their point and range key equivalents, + // and are updated via the MaybeExtend{Point,Range}KeyBounds methods. + Smallest InternalKey + Largest InternalKey + // Stats describe table statistics. Protected by DB.mu. + // + // For virtual sstables, set stats upon virtual sstable creation as + // asynchronous computation of stats is not currently supported. 
+ // + // TODO(bananabrick): To support manifest replay for virtual sstables, we + // probably need to compute virtual sstable stats asynchronously. Otherwise, + // we'd have to write virtual sstable stats to the version edit. + Stats TableStats + + // For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and + // pick L0 compactions. Only accurate for the most recent Version. + SubLevel int + L0Index int + minIntervalIndex int + maxIntervalIndex int + + // NB: the alignment of this struct is 8 bytes. We pack all the bools to + // ensure an optimal packing. + + // IsIntraL0Compacting is set to True if this file is part of an intra-L0 + // compaction. When it's true, IsCompacting must also return true. If + // Compacting is true and IsIntraL0Compacting is false for an L0 file, the + // file must be part of a compaction to Lbase. + IsIntraL0Compacting bool + CompactionState CompactionState + // True if compaction of this file has been explicitly requested. + // Previously, RocksDB and earlier versions of Pebble allowed this + // flag to be set by a user table property collector. Some earlier + // versions of Pebble respected this flag, while other more recent + // versions ignored this flag. + // + // More recently this flag has been repurposed to facilitate the + // compaction of 'atomic compaction units'. Files marked for + // compaction are compacted in a rewrite compaction at the lowest + // possible compaction priority. + // + // NB: A count of files marked for compaction is maintained on + // Version, and compaction picking reads cached annotations + // determined by this field. + // + // Protected by DB.mu. + MarkedForCompaction bool + // HasPointKeys tracks whether the table contains point keys (including + // RANGEDELs). If a table contains only range deletions, HasPointsKeys is + // still true. + HasPointKeys bool + // HasRangeKeys tracks whether the table contains any range keys. 
+ HasRangeKeys bool + // smallestSet and largestSet track whether the overall bounds have been set. + boundsSet bool + // boundTypeSmallest and boundTypeLargest provide an indication as to which + // key type (point or range) corresponds to the smallest and largest overall + // table bounds. + boundTypeSmallest, boundTypeLargest boundType + // Virtual is true if the FileMetadata belongs to a virtual sstable. + Virtual bool +} + +// PhysicalFileMeta is used by functions which want a guarantee that their input +// belongs to a physical sst and not a virtual sst. +// +// NB: This type should only be constructed by calling +// FileMetadata.PhysicalMeta. +type PhysicalFileMeta struct { + *FileMetadata +} + +// VirtualFileMeta is used by functions which want a guarantee that their input +// belongs to a virtual sst and not a physical sst. +// +// A VirtualFileMeta inherits all the same fields as a FileMetadata. These +// fields have additional invariants imposed on them, and/or slightly varying +// meanings: +// - Smallest and Largest (and their counterparts +// {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a +// key at that exact bound. We make the effort to determine the next smallest +// or largest key in an sstable after virtualizing it, to maintain this +// tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it +// could mean that a rangedel or range key ends at that user key, or has been +// truncated to that user key. +// - One invariant is that if a rangedel or range key is truncated on its +// upper bound, the virtual sstable *must* have a rangedel or range key +// sentinel key as its upper bound. This is because truncation yields +// an exclusive upper bound for the rangedel/rangekey, and if there are +// any points at that exclusive upper bound within the same virtual +// sstable, those could get uncovered by this truncation. We enforce this +// invariant in calls to keyspan.Truncate. 
+// - Size is an estimate of the size of the virtualized portion of this sstable. +// The underlying file's size is stored in FileBacking.Size, though it could +// also be estimated or could correspond to just the referenced portion of +// a file (eg. if the file originated on another node). +// - Size must be > 0. +// - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables. +// This means that all keys in the virtual sstable must have seqnums within +// [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's +// a key with a seqnum at either of the bounds. Calculating tight seqnum +// bounds would be too expensive and deliver little value. +// +// NB: This type should only be constructed by calling FileMetadata.VirtualMeta. +type VirtualFileMeta struct { + *FileMetadata +} + +// PhysicalMeta should be the only source of creating the PhysicalFileMeta +// wrapper type. +func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta { + if m.Virtual { + panic("pebble: file metadata does not belong to a physical sstable") + } + return PhysicalFileMeta{ + m, + } +} + +// VirtualMeta should be the only source of creating the VirtualFileMeta wrapper +// type. +func (m *FileMetadata) VirtualMeta() VirtualFileMeta { + if !m.Virtual { + panic("pebble: file metadata does not belong to a virtual sstable") + } + return VirtualFileMeta{ + m, + } +} + +// FileBacking either backs a single physical sstable, or one or more virtual +// sstables. +// +// See the comment above the FileMetadata type for sstable terminology. +type FileBacking struct { + // Reference count for the backing file on disk: incremented when a + // physical or virtual sstable which is backed by the FileBacking is + // added to a version and decremented when the version is unreferenced. + // We ref count in order to determine when it is safe to delete a + // backing sst file from disk. The backing file is obsolete when the + // reference count falls to zero. 
+ refs atomic.Int32 + // latestVersionRefs are the references to the FileBacking in the + // latest version. This reference can be through a single physical + // sstable in the latest version, or one or more virtual sstables in the + // latest version. + // + // INVARIANT: latestVersionRefs <= refs. + latestVersionRefs atomic.Int32 + // VirtualizedSize is set iff the backing sst is only referred to by + // virtual ssts in the latest version. VirtualizedSize is the sum of the + // virtual sstable sizes of all of the virtual sstables in the latest + // version which are backed by the physical sstable. When a virtual + // sstable is removed from the latest version, we will decrement the + // VirtualizedSize. During compaction picking, we'll compensate a + // virtual sstable file size by + // (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs. + // The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize + // is high, then the space amplification due to virtual sstables is + // high, and we should pick the virtual sstable with a higher priority. + // + // TODO(bananabrick): Compensate the virtual sstable file size using + // the VirtualizedSize during compaction picking and test. + VirtualizedSize atomic.Uint64 + DiskFileNum base.DiskFileNum + Size uint64 +} + +// InitPhysicalBacking allocates and sets the FileBacking which is required by a +// physical sstable FileMetadata. +// +// Ensure that the state required by FileBacking, such as the FileNum, is +// already set on the FileMetadata before InitPhysicalBacking is called. +// Calling InitPhysicalBacking only after the relevant state has been set in the +// FileMetadata is not necessary in tests which don't rely on FileBacking. 
+func (m *FileMetadata) InitPhysicalBacking() { + if m.Virtual { + panic("pebble: virtual sstables should use a pre-existing FileBacking") + } + if m.FileBacking == nil { + m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()} + } +} + +// InitProviderBacking creates a new FileBacking for a file backed by +// an objstorage.Provider. +func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) { + if !m.Virtual { + panic("pebble: provider-backed sstables must be virtual") + } + if m.FileBacking == nil { + m.FileBacking = &FileBacking{DiskFileNum: fileNum} + } +} + +// ValidateVirtual should be called once the FileMetadata for a virtual sstable +// is created to verify that the fields of the virtual sstable are sound. +func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) { + if !m.Virtual { + panic("pebble: invalid virtual sstable") + } + + if createdFrom.SmallestSeqNum != m.SmallestSeqNum { + panic("pebble: invalid smallest sequence number for virtual sstable") + } + + if createdFrom.LargestSeqNum != m.LargestSeqNum { + panic("pebble: invalid largest sequence number for virtual sstable") + } + + if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking { + panic("pebble: invalid physical sstable state for virtual sstable") + } + + if m.Size == 0 { + panic("pebble: virtual sstable size must be set upon creation") + } +} + +// Refs returns the refcount of backing sstable. +func (m *FileMetadata) Refs() int32 { + return m.FileBacking.refs.Load() +} + +// Ref increments the ref count associated with the backing sstable. +func (m *FileMetadata) Ref() { + m.FileBacking.refs.Add(1) +} + +// Unref decrements the ref count associated with the backing sstable. 
+func (m *FileMetadata) Unref() int32 { + v := m.FileBacking.refs.Add(-1) + if invariants.Enabled && v < 0 { + panic("pebble: invalid FileMetadata refcounting") + } + return v +} + +// LatestRef increments the latest ref count associated with the backing +// sstable. +func (m *FileMetadata) LatestRef() { + m.FileBacking.latestVersionRefs.Add(1) + + if m.Virtual { + m.FileBacking.VirtualizedSize.Add(m.Size) + } +} + +// LatestUnref decrements the latest ref count associated with the backing +// sstable. +func (m *FileMetadata) LatestUnref() int32 { + if m.Virtual { + m.FileBacking.VirtualizedSize.Add(-m.Size) + } + + v := m.FileBacking.latestVersionRefs.Add(-1) + if invariants.Enabled && v < 0 { + panic("pebble: invalid FileMetadata latest refcounting") + } + return v +} + +// LatestRefs returns the latest ref count associated with the backing sstable. +func (m *FileMetadata) LatestRefs() int32 { + return m.FileBacking.latestVersionRefs.Load() +} + +// SetCompactionState transitions this file's compaction state to the given +// state. Protected by DB.mu. +func (m *FileMetadata) SetCompactionState(to CompactionState) { + if invariants.Enabled { + transitionErr := func() error { + return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to) + } + switch m.CompactionState { + case CompactionStateNotCompacting: + if to != CompactionStateCompacting { + panic(transitionErr()) + } + case CompactionStateCompacting: + if to != CompactionStateCompacted && to != CompactionStateNotCompacting { + panic(transitionErr()) + } + case CompactionStateCompacted: + panic(transitionErr()) + default: + panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState)) + } + } + m.CompactionState = to +} + +// IsCompacting returns true if this file's compaction state is +// CompactionStateCompacting. Protected by DB.mu. 
+func (m *FileMetadata) IsCompacting() bool { + return m.CompactionState == CompactionStateCompacting +} + +// StatsValid returns true if the table stats have been populated. If StatValid +// returns true, the Stats field may be read (with or without holding the +// database mutex). +func (m *FileMetadata) StatsValid() bool { + return m.statsValid.Load() +} + +// StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu +// while populating TableStats and calling StatsMarkValud. Once stats are +// populated, they must not be mutated. +func (m *FileMetadata) StatsMarkValid() { + m.statsValid.Store(true) +} + +// ExtendPointKeyBounds attempts to extend the lower and upper point key bounds +// and overall table bounds with the given smallest and largest keys. The +// smallest and largest bounds may not be extended if the table already has a +// bound that is smaller or larger, respectively. The receiver is returned. +// NB: calling this method should be preferred to manually setting the bounds by +// manipulating the fields directly, to maintain certain invariants. +func (m *FileMetadata) ExtendPointKeyBounds( + cmp Compare, smallest, largest InternalKey, +) *FileMetadata { + // Update the point key bounds. + if !m.HasPointKeys { + m.SmallestPointKey, m.LargestPointKey = smallest, largest + m.HasPointKeys = true + } else { + if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 { + m.SmallestPointKey = smallest + } + if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 { + m.LargestPointKey = largest + } + } + // Update the overall bounds. + m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey) + return m +} + +// ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds +// and overall table bounds with the given smallest and largest keys. The +// smallest and largest bounds may not be extended if the table already has a +// bound that is smaller or larger, respectively. 
The receiver is returned. +// NB: calling this method should be preferred to manually setting the bounds by +// manipulating the fields directly, to maintain certain invariants. +func (m *FileMetadata) ExtendRangeKeyBounds( + cmp Compare, smallest, largest InternalKey, +) *FileMetadata { + // Update the range key bounds. + if !m.HasRangeKeys { + m.SmallestRangeKey, m.LargestRangeKey = smallest, largest + m.HasRangeKeys = true + } else { + if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 { + m.SmallestRangeKey = smallest + } + if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 { + m.LargestRangeKey = largest + } + } + // Update the overall bounds. + m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey) + return m +} + +// extendOverallBounds attempts to extend the overall table lower and upper +// bounds. The given bounds may not be used if a lower or upper bound already +// exists that is smaller or larger than the given keys, respectively. The given +// boundType will be used if the bounds are updated. +func (m *FileMetadata) extendOverallBounds( + cmp Compare, smallest, largest InternalKey, bTyp boundType, +) { + if !m.boundsSet { + m.Smallest, m.Largest = smallest, largest + m.boundsSet = true + m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp + } else { + if base.InternalCompare(cmp, smallest, m.Smallest) < 0 { + m.Smallest = smallest + m.boundTypeSmallest = bTyp + } + if base.InternalCompare(cmp, largest, m.Largest) > 0 { + m.Largest = largest + m.boundTypeLargest = bTyp + } + } +} + +// Overlaps returns true if the file key range overlaps with the given range. +func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool { + if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) { + // f is completely before the specified range; no overlap. 
+ return false + } + if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) { + // f is completely after the specified range; no overlap. + return false + } + return true +} + +// ContainedWithinSpan returns true if the file key range completely overlaps with the +// given range ("end" is assumed to exclusive). +func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool { + lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end) + return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel())) +} + +// ContainsKeyType returns whether or not the file contains keys of the provided +// type. +func (m *FileMetadata) ContainsKeyType(kt KeyType) bool { + switch kt { + case KeyTypePointAndRange: + return true + case KeyTypePoint: + return m.HasPointKeys + case KeyTypeRange: + return m.HasRangeKeys + default: + panic("unrecognized key type") + } +} + +// SmallestBound returns the file's smallest bound of the key type. It returns a +// false second return value if the file does not contain any keys of the key +// type. +func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) { + switch kt { + case KeyTypePointAndRange: + return &m.Smallest, true + case KeyTypePoint: + return &m.SmallestPointKey, m.HasPointKeys + case KeyTypeRange: + return &m.SmallestRangeKey, m.HasRangeKeys + default: + panic("unrecognized key type") + } +} + +// LargestBound returns the file's largest bound of the key type. It returns a +// false second return value if the file does not contain any keys of the key +// type. 
+func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) { + switch kt { + case KeyTypePointAndRange: + return &m.Largest, true + case KeyTypePoint: + return &m.LargestPointKey, m.HasPointKeys + case KeyTypeRange: + return &m.LargestRangeKey, m.HasRangeKeys + default: + panic("unrecognized key type") + } +} + +const ( + maskContainsPointKeys = 1 << 0 + maskSmallest = 1 << 1 + maskLargest = 1 << 2 +) + +// boundsMarker returns a marker byte whose bits encode the following +// information (in order from least significant bit): +// - if the table contains point keys +// - if the table's smallest key is a point key +// - if the table's largest key is a point key +func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) { + if m.HasPointKeys { + sentinel |= maskContainsPointKeys + } + switch m.boundTypeSmallest { + case boundTypePointKey: + sentinel |= maskSmallest + case boundTypeRangeKey: + // No op - leave bit unset. + default: + return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum) + } + switch m.boundTypeLargest { + case boundTypePointKey: + sentinel |= maskLargest + case boundTypeRangeKey: + // No op - leave bit unset. + default: + return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum) + } + return +} + +// String implements fmt.Stringer, printing the file number and the overall +// table bounds. +func (m *FileMetadata) String() string { + return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest) +} + +// DebugString returns a verbose representation of FileMetadata, typically for +// use in tests and debugging, returning the file number and the point, range +// and overall bounds for the table. 
+func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string { + var b bytes.Buffer + fmt.Fprintf(&b, "%s:[%s-%s]", + m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format)) + if !verbose { + return b.String() + } + fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum) + if m.HasPointKeys { + fmt.Fprintf(&b, " points:[%s-%s]", + m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format)) + } + if m.HasRangeKeys { + fmt.Fprintf(&b, " ranges:[%s-%s]", + m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format)) + } + return b.String() +} + +// ParseFileMetadataDebug parses a FileMetadata from its DebugString +// representation. +func ParseFileMetadataDebug(s string) (*FileMetadata, error) { + // Split lines of the form: + // 000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...] + fields := strings.FieldsFunc(s, func(c rune) bool { + switch c { + case ':', '[', '-', ']': + return true + default: + return unicode.IsSpace(c) // NB: also trim whitespace padding. 
+ } + }) + if len(fields)%3 != 0 { + return nil, errors.Newf("malformed input: %s", s) + } + m := &FileMetadata{} + for len(fields) > 0 { + prefix := fields[0] + if prefix == "seqnums" { + smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return m, errors.Newf("malformed input: %s: %s", s, err) + } + largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64) + if err != nil { + return m, errors.Newf("malformed input: %s: %s", s, err) + } + m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum + fields = fields[3:] + continue + } + smallest := base.ParsePrettyInternalKey(fields[1]) + largest := base.ParsePrettyInternalKey(fields[2]) + switch prefix { + case "points": + m.SmallestPointKey, m.LargestPointKey = smallest, largest + m.HasPointKeys = true + case "ranges": + m.SmallestRangeKey, m.LargestRangeKey = smallest, largest + m.HasRangeKeys = true + default: + fileNum, err := strconv.ParseUint(prefix, 10, 64) + if err != nil { + return m, errors.Newf("malformed input: %s: %s", s, err) + } + m.FileNum = base.FileNum(fileNum) + m.Smallest, m.Largest = smallest, largest + m.boundsSet = true + } + fields = fields[3:] + } + // By default, when the parser sees just the overall bounds, we set the point + // keys. This preserves backwards compatability with existing test cases that + // specify only the overall bounds. + if !m.HasPointKeys && !m.HasRangeKeys { + m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest + m.HasPointKeys = true + } + m.InitPhysicalBacking() + return m, nil +} + +// Validate validates the metadata for consistency with itself, returning an +// error if inconsistent. +func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error { + // Combined range and point key validation. 
+ + if !m.HasPointKeys && !m.HasRangeKeys { + return base.CorruptionErrorf("file %s has neither point nor range keys", + errors.Safe(m.FileNum)) + } + if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 { + return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s", + errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey), + m.Largest.Pretty(formatKey)) + } + if m.SmallestSeqNum > m.LargestSeqNum { + return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d", + errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum) + } + + // Point key validation. + + if m.HasPointKeys { + if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 { + return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s", + errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey), + m.LargestPointKey.Pretty(formatKey)) + } + if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 || + base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 { + return base.CorruptionErrorf( + "file %s has inconsistent point key bounds relative to overall bounds: "+ + "overall = [%s-%s], point keys = [%s-%s]", + errors.Safe(m.FileNum), + m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey), + m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey), + ) + } + } + + // Range key validation. 
+ + if m.HasRangeKeys { + if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 { + return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s", + errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey), + m.LargestRangeKey.Pretty(formatKey)) + } + if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 || + base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 { + return base.CorruptionErrorf( + "file %s has inconsistent range key bounds relative to overall bounds: "+ + "overall = [%s-%s], range keys = [%s-%s]", + errors.Safe(m.FileNum), + m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey), + m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey), + ) + } + } + + // Ensure that FileMetadata.Init was called. + if m.FileBacking == nil { + return base.CorruptionErrorf("file metadata FileBacking not set") + } + + return nil +} + +// TableInfo returns a subset of the FileMetadata state formatted as a +// TableInfo. +func (m *FileMetadata) TableInfo() TableInfo { + return TableInfo{ + FileNum: m.FileNum, + Size: m.Size, + Smallest: m.Smallest, + Largest: m.Largest, + SmallestSeqNum: m.SmallestSeqNum, + LargestSeqNum: m.LargestSeqNum, + } +} + +func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int { + // NB: This is the same ordering that RocksDB uses for L0 files. + + // Sort first by largest sequence number. + if v := stdcmp.Compare(m.LargestSeqNum, b.LargestSeqNum); v != 0 { + return v + } + // Then by smallest sequence number. + if v := stdcmp.Compare(m.SmallestSeqNum, b.SmallestSeqNum); v != 0 { + return v + } + // Break ties by file number. 
+ return stdcmp.Compare(m.FileNum, b.FileNum) +} + +func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool { + return m.cmpSeqNum(b) < 0 +} + +func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int { + return base.InternalCompare(cmp, m.Smallest, b.Smallest) +} + +// KeyRange returns the minimum smallest and maximum largest internalKey for +// all the FileMetadata in iters. +func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) { + first := true + for _, iter := range iters { + for meta := iter.First(); meta != nil; meta = iter.Next() { + if first { + first = false + smallest, largest = meta.Smallest, meta.Largest + continue + } + if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 { + smallest = meta.Smallest + } + if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 { + largest = meta.Largest + } + } + } + return smallest, largest +} + +type bySeqNum []*FileMetadata + +func (b bySeqNum) Len() int { return len(b) } +func (b bySeqNum) Less(i, j int) bool { + return b[i].lessSeqNum(b[j]) +} +func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] } + +// SortBySeqNum sorts the specified files by increasing sequence number. +func SortBySeqNum(files []*FileMetadata) { + sort.Sort(bySeqNum(files)) +} + +type bySmallest struct { + files []*FileMetadata + cmp Compare +} + +func (b bySmallest) Len() int { return len(b.files) } +func (b bySmallest) Less(i, j int) bool { + return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0 +} +func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] } + +// SortBySmallest sorts the specified files by smallest key using the supplied +// comparison function to order user keys. 
+func SortBySmallest(files []*FileMetadata, cmp Compare) { + sort.Sort(bySmallest{files, cmp}) +} + +func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice { + startIter := iter.Clone() + { + startIterFile := startIter.SeekGE(cmp, start) + // SeekGE compares user keys. The user key `start` may be equal to the + // f.Largest because f.Largest is a range deletion sentinel, indicating + // that the user key `start` is NOT contained within the file f. If + // that's the case, we can narrow the overlapping bounds to exclude the + // file with the sentinel. + if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() && + cmp(startIterFile.Largest.UserKey, start) == 0 { + startIterFile = startIter.Next() + } + _ = startIterFile // Ignore unused assignment. + } + + endIter := iter.Clone() + { + endIterFile := endIter.SeekGE(cmp, end) + + if !exclusiveEnd { + // endIter is now pointing at the *first* file with a largest key >= end. + // If there are multiple files including the user key `end`, we want all + // of them, so move forward. + for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 { + endIterFile = endIter.Next() + } + } + + // LevelSlice uses inclusive bounds, so if we seeked to the end sentinel + // or nexted too far because Largest.UserKey equaled `end`, go back. + // + // Consider !exclusiveEnd and end = 'f', with the following file bounds: + // + // [b,d] [e, f] [f, f] [g, h] + // + // the above for loop will Next until it arrives at [g, h]. We need to + // observe that g > f, and Prev to the file with bounds [f, f]. + if endIterFile == nil { + endIterFile = endIter.Prev() + } else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd { + endIterFile = endIter.Prev() + } + _ = endIterFile // Ignore unused assignment. 
+ } + return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter) +} + +// NumLevels is the number of levels a Version contains. +const NumLevels = 7 + +// NewVersion constructs a new Version with the provided files. It requires +// the provided files are already well-ordered. It's intended for testing. +func NewVersion( + cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata, +) *Version { + var v Version + for l := range files { + // NB: We specifically insert `files` into the B-Tree in the order + // they appear within `files`. Some tests depend on this behavior in + // order to test consistency checking, etc. Once we've constructed the + // initial B-Tree, we swap out the btreeCmp for the correct one. + // TODO(jackson): Adjust or remove the tests and remove this. + v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l]) + v.Levels[l].level = l + if l == 0 { + v.Levels[l].tree.cmp = btreeCmpSeqNum + } else { + v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp) + } + for _, f := range files[l] { + v.Levels[l].totalSize += f.Size + } + } + if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil { + panic(err) + } + return &v +} + +// Version is a collection of file metadata for on-disk tables at various +// levels. In-memory DBs are written to level-0 tables, and compactions +// migrate data from level N to level N+1. The tables map internal keys (which +// are a user key, a delete or set bit, and a sequence number) to user values. +// +// The tables at level 0 are sorted by largest sequence number. Due to file +// ingestion, there may be overlap in the ranges of sequence numbers contain in +// level 0 sstables. In particular, it is valid for one level 0 sstable to have +// the seqnum range [1,100] while an adjacent sstable has the seqnum range +// [50,50]. This occurs when the [50,50] table was ingested and given a global +// seqnum. 
The ingestion code will have ensured that the [50,50] sstable will +// not have any keys that overlap with the [1,100] in the seqnum range +// [1,49]. The range of internal keys [fileMetadata.smallest, +// fileMetadata.largest] in each level 0 table may overlap. +// +// The tables at any non-0 level are sorted by their internal key range and any +// two tables at the same non-0 level do not overlap. +// +// The internal key ranges of two tables at different levels X and Y may +// overlap, for any X != Y. +// +// Finally, for every internal key in a table at level X, there is no internal +// key in a higher level table that has both the same user key and a higher +// sequence number. +type Version struct { + refs atomic.Int32 + + // The level 0 sstables are organized in a series of sublevels. Similar to + // the seqnum invariant in normal levels, there is no internal key in a + // higher level table that has both the same user key and a higher sequence + // number. Within a sublevel, tables are sorted by their internal key range + // and any two tables at the same sublevel do not overlap. Unlike the normal + // levels, sublevel n contains older tables (lower sequence numbers) than + // sublevel n+1. + // + // The L0Sublevels struct is mostly used for compaction picking. As most + // internal data structures in it are only necessary for compaction picking + // and not for iterator creation, the reference to L0Sublevels is nil'd + // after this version becomes the non-newest version, to reduce memory + // usage. + // + // L0Sublevels.Levels contains L0 files ordered by sublevels. All the files + // in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to + // a reference to that slice, as that slice is necessary for iterator + // creation and needs to outlast L0Sublevels. 
+ L0Sublevels *L0Sublevels + L0SublevelFiles []LevelSlice + + Levels [NumLevels]LevelMetadata + + // RangeKeyLevels holds a subset of the same files as Levels that contain range + // keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this + // duplication should be minimal, as range keys are expected to be rare. + RangeKeyLevels [NumLevels]LevelMetadata + + // The callback to invoke when the last reference to a version is + // removed. Will be called with list.mu held. + Deleted func(obsolete []*FileBacking) + + // Stats holds aggregated stats about the version maintained from + // version to version. + Stats struct { + // MarkedForCompaction records the count of files marked for + // compaction within the version. + MarkedForCompaction int + } + + // The list the version is linked into. + list *VersionList + + // The next/prev link for the versionList doubly-linked list of versions. + prev, next *Version +} + +// String implements fmt.Stringer, printing the FileMetadata for each level in +// the Version. +func (v *Version) String() string { + return v.string(base.DefaultFormatter, false) +} + +// DebugString returns an alternative format to String() which includes sequence +// number and kind information for the sstable boundaries. 
+func (v *Version) DebugString(format base.FormatKey) string { + return v.string(format, true) +} + +func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string { + var buf bytes.Buffer + for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- { + fmt.Fprintf(&buf, "0.%d:\n", sublevel) + sublevels[sublevel].Each(func(f *FileMetadata) { + fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose)) + }) + } + return buf.String() +} + +func (v *Version) string(format base.FormatKey, verbose bool) string { + var buf bytes.Buffer + if len(v.L0SublevelFiles) > 0 { + fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles)) + } + for level := 1; level < NumLevels; level++ { + if v.Levels[level].Empty() { + continue + } + fmt.Fprintf(&buf, "%d:\n", level) + iter := v.Levels[level].Iter() + for f := iter.First(); f != nil; f = iter.Next() { + fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose)) + } + } + return buf.String() +} + +// ParseVersionDebug parses a Version from its DebugString output. +func ParseVersionDebug( + cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string, +) (*Version, error) { + var level int + var files [NumLevels][]*FileMetadata + for _, l := range strings.Split(s, "\n") { + l = strings.TrimSpace(l) + + switch l[:2] { + case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:": + var err error + level, err = strconv.Atoi(l[:1]) + if err != nil { + return nil, err + } + default: + m, err := ParseFileMetadataDebug(l) + if err != nil { + return nil, err + } + // If we only parsed overall bounds, default to setting the point bounds. + if !m.HasPointKeys && !m.HasRangeKeys { + m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest + m.HasPointKeys = true + } + files[level] = append(files[level], m) + } + } + // Reverse the order of L0 files. This ensures we construct the same + // sublevels. 
(They're printed from higher sublevel to lower, which means in + // a partial order that represents newest to oldest). + for i := 0; i < len(files[0])/2; i++ { + files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i] + } + return NewVersion(cmp, formatKey, flushSplitBytes, files), nil +} + +// Refs returns the number of references to the version. +func (v *Version) Refs() int32 { + return v.refs.Load() +} + +// Ref increments the version refcount. +func (v *Version) Ref() { + v.refs.Add(1) +} + +// Unref decrements the version refcount. If the last reference to the version +// was removed, the version is removed from the list of versions and the +// Deleted callback is invoked. Requires that the VersionList mutex is NOT +// locked. +func (v *Version) Unref() { + if v.refs.Add(-1) == 0 { + l := v.list + l.mu.Lock() + l.Remove(v) + v.Deleted(v.unrefFiles()) + l.mu.Unlock() + } +} + +// UnrefLocked decrements the version refcount. If the last reference to the +// version was removed, the version is removed from the list of versions and +// the Deleted callback is invoked. Requires that the VersionList mutex is +// already locked. +func (v *Version) UnrefLocked() { + if v.refs.Add(-1) == 0 { + v.list.Remove(v) + v.Deleted(v.unrefFiles()) + } +} + +func (v *Version) unrefFiles() []*FileBacking { + var obsolete []*FileBacking + for _, lm := range v.Levels { + obsolete = append(obsolete, lm.release()...) + } + for _, lm := range v.RangeKeyLevels { + obsolete = append(obsolete, lm.release()...) + } + return obsolete +} + +// Next returns the next version in the list of versions. 
+func (v *Version) Next() *Version { + return v.next +} + +// InitL0Sublevels initializes the L0Sublevels +func (v *Version) InitL0Sublevels( + cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, +) error { + var err error + v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes) + if err == nil && v.L0Sublevels != nil { + v.L0SublevelFiles = v.L0Sublevels.Levels + } + return err +} + +// Contains returns a boolean indicating whether the provided file exists in +// the version at the given level. If level is non-zero then Contains binary +// searches among the files. If level is zero, Contains scans the entire +// level. +func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool { + iter := v.Levels[level].Iter() + if level > 0 { + overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey, + m.Largest.IsExclusiveSentinel()) + iter = overlaps.Iter() + } + for f := iter.First(); f != nil; f = iter.Next() { + if f == m { + return true + } + } + return false +} + +// Overlaps returns all elements of v.files[level] whose user key range +// intersects the given range. If level is non-zero then the user key ranges of +// v.files[level] are assumed to not overlap (although they may touch). If level +// is zero then that assumption cannot be made, and the [start, end] range is +// expanded to the union of those matching ranges so far and the computation is +// repeated until [start, end] stabilizes. +// The returned files are a subsequence of the input files, i.e., the ordering +// is not changed. +func (v *Version) Overlaps( + level int, cmp Compare, start, end []byte, exclusiveEnd bool, +) LevelSlice { + if level == 0 { + // Indices that have been selected as overlapping. 
+ l0 := v.Levels[level] + l0Iter := l0.Iter() + selectedIndices := make([]bool, l0.Len()) + numSelected := 0 + var slice LevelSlice + for { + restart := false + for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() { + selected := selectedIndices[i] + if selected { + continue + } + if !meta.Overlaps(cmp, start, end, exclusiveEnd) { + // meta is completely outside the specified range; skip it. + continue + } + // Overlaps. + selectedIndices[i] = true + numSelected++ + + smallest := meta.Smallest.UserKey + largest := meta.Largest.UserKey + // Since level == 0, check if the newly added fileMetadata has + // expanded the range. We expand the range immediately for files + // we have remaining to check in this loop. All already checked + // and unselected files will need to be rechecked via the + // restart below. + if cmp(smallest, start) < 0 { + start = smallest + restart = true + } + if v := cmp(largest, end); v > 0 { + end = largest + exclusiveEnd = meta.Largest.IsExclusiveSentinel() + restart = true + } else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() { + // Only update the exclusivity of our existing `end` + // bound. + exclusiveEnd = false + restart = true + } + } + + if !restart { + // Construct a B-Tree containing only the matching items. + var tr btree + tr.cmp = v.Levels[level].tree.cmp + for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() { + if selectedIndices[i] { + err := tr.Insert(meta) + if err != nil { + panic(err) + } + } + } + slice = newLevelSlice(tr.Iter()) + // TODO(jackson): Avoid the oddity of constructing and + // immediately releasing a B-Tree. Make LevelSlice an + // interface? + tr.Release() + break + } + // Continue looping to retry the files that were not selected. 
+ } + return slice + } + + return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd) +} + +// CheckOrdering checks that the files are consistent with respect to +// increasing file numbers (for level 0 files) and increasing and non- +// overlapping internal key ranges (for level non-0 files). +func (v *Version) CheckOrdering( + cmp Compare, format base.FormatKey, order OrderingInvariants, +) error { + for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- { + sublevelIter := v.L0SublevelFiles[sublevel].Iter() + // Sublevels have NEVER allowed split user keys, so we can pass + // ProhibitSplitUserKeys. + if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter, ProhibitSplitUserKeys); err != nil { + return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format)) + } + } + + for level, lm := range v.Levels { + if err := CheckOrdering(cmp, format, Level(level), lm.Iter(), order); err != nil { + return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format)) + } + } + return nil +} + +// VersionList holds a list of versions. The versions are ordered from oldest +// to newest. +type VersionList struct { + mu *sync.Mutex + root Version +} + +// Init initializes the version list. +func (l *VersionList) Init(mu *sync.Mutex) { + l.mu = mu + l.root.next = &l.root + l.root.prev = &l.root +} + +// Empty returns true if the list is empty, and false otherwise. +func (l *VersionList) Empty() bool { + return l.root.next == &l.root +} + +// Front returns the oldest version in the list. Note that this version is only +// valid if Empty() returns true. +func (l *VersionList) Front() *Version { + return l.root.next +} + +// Back returns the newest version in the list. Note that this version is only +// valid if Empty() returns true. +func (l *VersionList) Back() *Version { + return l.root.prev +} + +// PushBack adds a new version to the back of the list. This new version +// becomes the "newest" version in the list. 
+func (l *VersionList) PushBack(v *Version) { + if v.list != nil || v.prev != nil || v.next != nil { + panic("pebble: version list is inconsistent") + } + v.prev = l.root.prev + v.prev.next = v + v.next = &l.root + v.next.prev = v + v.list = l + // Let L0Sublevels on the second newest version get GC'd, as it is no longer + // necessary. See the comment in Version. + v.prev.L0Sublevels = nil +} + +// Remove removes the specified version from the list. +func (l *VersionList) Remove(v *Version) { + if v == &l.root { + panic("pebble: cannot remove version list root node") + } + if v.list != l { + panic("pebble: version list is inconsistent") + } + v.prev.next = v.next + v.next.prev = v.prev + v.next = nil // avoid memory leaks + v.prev = nil // avoid memory leaks + v.list = nil // avoid memory leaks +} + +// OrderingInvariants dictates the file ordering invariants active. +type OrderingInvariants int8 + +const ( + // ProhibitSplitUserKeys indicates that adjacent files within a level cannot + // contain the same user key. + ProhibitSplitUserKeys OrderingInvariants = iota + // AllowSplitUserKeys indicates that adjacent files within a level may + // contain the same user key. This is only allowed by historical format + // major versions. + // + // TODO(jackson): Remove. + AllowSplitUserKeys +) + +// CheckOrdering checks that the files are consistent with respect to +// seqnums (for level 0 files -- see detailed comment below) and increasing and non- +// overlapping internal key ranges (for non-level 0 files). +// +// The ordering field may be passed AllowSplitUserKeys to allow adjacent files that are both +// inclusive of the same user key. Pebble no longer creates version edits +// installing such files, and Pebble databases with sufficiently high format +// major version should no longer have any such files within their LSM. +// TODO(jackson): Remove AllowSplitUserKeys when we remove support for the +// earlier format major versions. 
+func CheckOrdering( + cmp Compare, format base.FormatKey, level Level, files LevelIterator, ordering OrderingInvariants, +) error { + // The invariants to check for L0 sublevels are the same as the ones to + // check for all other levels. However, if L0 is not organized into + // sublevels, or if all L0 files are being passed in, we do the legacy L0 + // checks, defined in the detailed comment below. + if level == Level(0) { + // We have 2 kinds of files: + // - Files with exactly one sequence number: these could be either ingested files + // or flushed files. We cannot tell the difference between them based on FileMetadata, + // so our consistency checking here uses the weaker checks assuming it is a narrow + // flushed file. We cannot error on ingested files having sequence numbers coincident + // with flushed files as the seemingly ingested file could just be a flushed file + // with just one key in it which is a truncated range tombstone sharing sequence numbers + // with other files in the same flush. + // - Files with multiple sequence numbers: these are necessarily flushed files. + // + // Three cases of overlapping sequence numbers: + // Case 1: + // An ingested file contained in the sequence numbers of the flushed file -- it must be + // fully contained (not coincident with either end of the flushed file) since the memtable + // must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence + // num b, and the memtable got a subsequent update that was given sequence num b+1, before + // being flushed. + // + // So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and + // third file are inconsistent with each other. So comparing adjacent files is insufficient + // for consistency checking. 
+ // + // Visually we have something like + // x------y x-----------yx-------------y (flushed files where x, y are the endpoints) + // y y y y (y's represent ingested files) + // And these are ordered in increasing order of y. Note that y's must be unique. + // + // Case 2: + // A flushed file that did not overlap in keys with any file in any level, but does overlap + // in the file key intervals. This file is placed in L0 since it overlaps in the file + // key intervals but since it has no overlapping data, it is assigned a sequence number + // of 0 in RocksDB. We handle this case for compatibility with RocksDB. + // + // Case 3: + // A sequence of flushed files that overlap in sequence numbers with one another, + // but do not overlap in keys inside the sstables. These files correspond to + // partitioned flushes or the results of intra-L0 compactions of partitioned + // flushes. + // + // Since these types of SSTables violate most other sequence number + // overlap invariants, and handling this case is important for compatibility + // with future versions of pebble, this method relaxes most L0 invariant + // checks. + + var prev *FileMetadata + for f := files.First(); f != nil; f, prev = files.Next(), f { + if prev == nil { + continue + } + // Validate that the sorting is sane. + if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum { + // Multiple files satisfying case 2 mentioned above. 
+ } else if !prev.lessSeqNum(f) { + return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>", + errors.Safe(prev.FileNum), errors.Safe(f.FileNum), + errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum), + errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum)) + } + } + } else { + var prev *FileMetadata + for f := files.First(); f != nil; f, prev = files.Next(), f { + if err := f.Validate(cmp, format); err != nil { + return errors.Wrapf(err, "%s ", level) + } + if prev != nil { + if prev.cmpSmallestKey(f, cmp) >= 0 { + return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]", + errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum), + prev.Smallest.Pretty(format), prev.Largest.Pretty(format), + f.Smallest.Pretty(format), f.Largest.Pretty(format)) + } + + // What's considered "overlapping" is dependent on the format + // major version. If ordering=ProhibitSplitUserKeys, then both + // files cannot contain keys with the same user keys. If the + // bounds have the same user key, the previous file's boundary + // must have a Trailer indicating that it's exclusive. 
+ switch ordering { + case AllowSplitUserKeys: + if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 { + return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]", + errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum), + prev.Smallest.Pretty(format), prev.Largest.Pretty(format), + f.Smallest.Pretty(format), f.Largest.Pretty(format)) + } + case ProhibitSplitUserKeys: + if v := cmp(prev.Largest.UserKey, f.Smallest.UserKey); v > 0 || (v == 0 && !prev.Largest.IsExclusiveSentinel()) { + return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]", + errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum), + prev.Smallest.Pretty(format), prev.Largest.Pretty(format), + f.Smallest.Pretty(format), f.Largest.Pretty(format)) + } + default: + panic("unreachable") + } + } + } + } + return nil +} diff --git a/pebble/internal/manifest/version_edit.go b/pebble/internal/manifest/version_edit.go new file mode 100644 index 0000000..ee3a919 --- /dev/null +++ b/pebble/internal/manifest/version_edit.go @@ -0,0 +1,1122 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "io" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/invariants" + stdcmp "github.com/cockroachdb/pebble/shims/cmp" + "github.com/cockroachdb/pebble/shims/slices" +) + +// TODO(peter): describe the MANIFEST file format, independently of the C++ +// project. + +var errCorruptManifest = base.CorruptionErrorf("pebble: corrupt manifest") + +type byteReader interface { + io.ByteReader + io.Reader +} + +// Tags for the versionEdit disk format. +// Tag 8 is no longer used. +const ( + // LevelDB tags. 
+ tagComparator = 1 + tagLogNumber = 2 + tagNextFileNumber = 3 + tagLastSequence = 4 + tagCompactPointer = 5 + tagDeletedFile = 6 + tagNewFile = 7 + tagPrevLogNumber = 9 + + // RocksDB tags. + tagNewFile2 = 100 + tagNewFile3 = 102 + tagNewFile4 = 103 + tagColumnFamily = 200 + tagColumnFamilyAdd = 201 + tagColumnFamilyDrop = 202 + tagMaxColumnFamily = 203 + + // Pebble tags. + tagNewFile5 = 104 // Range keys. + tagCreatedBackingTable = 105 + tagRemovedBackingTable = 106 + + // The custom tags sub-format used by tagNewFile4 and above. + customTagTerminate = 1 + customTagNeedsCompaction = 2 + customTagCreationTime = 6 + customTagPathID = 65 + customTagNonSafeIgnoreMask = 1 << 6 + customTagVirtual = 66 +) + +// DeletedFileEntry holds the state for a file deletion from a level. The file +// itself might still be referenced by another level. +type DeletedFileEntry struct { + Level int + FileNum base.FileNum +} + +// NewFileEntry holds the state for a new file or one moved from a different +// level. +type NewFileEntry struct { + Level int + Meta *FileMetadata + // BackingFileNum is only set during manifest replay, and only for virtual + // sstables. + BackingFileNum base.DiskFileNum +} + +// VersionEdit holds the state for an edit to a Version along with other +// on-disk state (log numbers, next file number, and the last sequence number). +type VersionEdit struct { + // ComparerName is the value of Options.Comparer.Name. This is only set in + // the first VersionEdit in a manifest (either when the DB is created, or + // when a new manifest is created) and is used to verify that the comparer + // specified at Open matches the comparer that was previously used. + ComparerName string + + // MinUnflushedLogNum is the smallest WAL log file number corresponding to + // mutations that have not been flushed to an sstable. + // + // This is an optional field, and 0 represents it is not set. 
+ MinUnflushedLogNum base.DiskFileNum + + // ObsoletePrevLogNum is a historic artifact from LevelDB that is not used by + // Pebble, RocksDB, or even LevelDB. Its use in LevelDB was deprecated in + // 6/2011. We keep it around purely for informational purposes when + // displaying MANIFEST contents. + ObsoletePrevLogNum uint64 + + // The next file number. A single counter is used to assign file numbers + // for the WAL, MANIFEST, sstable, and OPTIONS files. + NextFileNum uint64 + + // LastSeqNum is an upper bound on the sequence numbers that have been + // assigned in flushed WALs. Unflushed WALs (that will be replayed during + // recovery) may contain sequence numbers greater than this value. + LastSeqNum uint64 + + // A file num may be present in both deleted files and new files when it + // is moved from a lower level to a higher level (when the compaction + // found that there was no overlapping file at the higher level). + DeletedFiles map[DeletedFileEntry]*FileMetadata + NewFiles []NewFileEntry + // CreatedBackingTables can be used to preserve the FileBacking associated + // with a physical sstable. This is useful when virtual sstables in the + // latest version are reconstructed during manifest replay, and we also need + // to reconstruct the FileBacking which is required by these virtual + // sstables. + // + // INVARIANT: The FileBacking associated with a physical sstable must only + // be added as a backing file in the same version edit where the physical + // sstable is first virtualized. This means that the physical sstable must + // be present in DeletedFiles and that there must be at least one virtual + // sstable with the same FileBacking as the physical sstable in NewFiles. A + // file must be present in CreatedBackingTables in exactly one version edit. + // The physical sstable associated with the FileBacking must also not be + // present in NewFiles. 
+ CreatedBackingTables []*FileBacking + // RemovedBackingTables is used to remove the FileBacking associated with a + // virtual sstable. Note that a backing sstable can be removed as soon as + // there are no virtual sstables in the latest version which are using the + // backing sstable, but the backing sstable doesn't necessarily have to be + // removed atomically with the version edit which removes the last virtual + // sstable associated with the backing sstable. The removal can happen in a + // future version edit. + // + // INVARIANT: A file must only be added to RemovedBackingTables if it was + // added to CreateBackingTables in a prior version edit. The same version + // edit also cannot have the same file present in both CreateBackingTables + // and RemovedBackingTables. A file must be present in RemovedBackingTables + // in exactly one version edit. + RemovedBackingTables []base.DiskFileNum +} + +// Decode decodes an edit from the specified reader. +// +// Note that the Decode step will not set the FileBacking for virtual sstables +// and the responsibility is left to the caller. However, the Decode step will +// populate the NewFileEntry.BackingFileNum in VersionEdit.NewFiles. 
+func (v *VersionEdit) Decode(r io.Reader) error { + br, ok := r.(byteReader) + if !ok { + br = bufio.NewReader(r) + } + d := versionEditDecoder{br} + for { + tag, err := binary.ReadUvarint(br) + if err == io.EOF { + break + } + if err != nil { + return err + } + switch tag { + case tagComparator: + s, err := d.readBytes() + if err != nil { + return err + } + v.ComparerName = string(s) + + case tagLogNumber: + n, err := d.readUvarint() + if err != nil { + return err + } + v.MinUnflushedLogNum = base.DiskFileNum(n) + + case tagNextFileNumber: + n, err := d.readUvarint() + if err != nil { + return err + } + v.NextFileNum = n + + case tagLastSequence: + n, err := d.readUvarint() + if err != nil { + return err + } + v.LastSeqNum = n + + case tagCompactPointer: + if _, err := d.readLevel(); err != nil { + return err + } + if _, err := d.readBytes(); err != nil { + return err + } + // NB: RocksDB does not use compaction pointers anymore. + + case tagRemovedBackingTable: + n, err := d.readUvarint() + if err != nil { + return err + } + v.RemovedBackingTables = append( + v.RemovedBackingTables, base.FileNum(n).DiskFileNum(), + ) + case tagCreatedBackingTable: + dfn, err := d.readUvarint() + if err != nil { + return err + } + size, err := d.readUvarint() + if err != nil { + return err + } + fileBacking := &FileBacking{ + DiskFileNum: base.FileNum(dfn).DiskFileNum(), + Size: size, + } + v.CreatedBackingTables = append(v.CreatedBackingTables, fileBacking) + case tagDeletedFile: + level, err := d.readLevel() + if err != nil { + return err + } + fileNum, err := d.readFileNum() + if err != nil { + return err + } + if v.DeletedFiles == nil { + v.DeletedFiles = make(map[DeletedFileEntry]*FileMetadata) + } + v.DeletedFiles[DeletedFileEntry{level, fileNum}] = nil + + case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4, tagNewFile5: + level, err := d.readLevel() + if err != nil { + return err + } + fileNum, err := d.readFileNum() + if err != nil { + return err + } + if tag == 
tagNewFile3 { + // The pathID field appears unused in RocksDB. + _ /* pathID */, err := d.readUvarint() + if err != nil { + return err + } + } + size, err := d.readUvarint() + if err != nil { + return err + } + // We read the smallest / largest key bounds differently depending on + // whether we have point, range or both types of keys present in the + // table. + var ( + smallestPointKey, largestPointKey []byte + smallestRangeKey, largestRangeKey []byte + parsedPointBounds bool + boundsMarker byte + ) + if tag != tagNewFile5 { + // Range keys not present in the table. Parse the point key bounds. + smallestPointKey, err = d.readBytes() + if err != nil { + return err + } + largestPointKey, err = d.readBytes() + if err != nil { + return err + } + } else { + // Range keys are present in the table. Determine whether we have point + // keys to parse, in addition to the bounds. + boundsMarker, err = d.ReadByte() + if err != nil { + return err + } + // Parse point key bounds, if present. + if boundsMarker&maskContainsPointKeys > 0 { + smallestPointKey, err = d.readBytes() + if err != nil { + return err + } + largestPointKey, err = d.readBytes() + if err != nil { + return err + } + parsedPointBounds = true + } else { + // The table does not have point keys. + // Sanity check: the bounds must be range keys. + if boundsMarker&maskSmallest != 0 || boundsMarker&maskLargest != 0 { + return base.CorruptionErrorf( + "new-file-4-range-keys: table without point keys has point key bounds: marker=%x", + boundsMarker, + ) + } + } + // Parse range key bounds. 
+ smallestRangeKey, err = d.readBytes() + if err != nil { + return err + } + largestRangeKey, err = d.readBytes() + if err != nil { + return err + } + } + var smallestSeqNum uint64 + var largestSeqNum uint64 + if tag != tagNewFile { + smallestSeqNum, err = d.readUvarint() + if err != nil { + return err + } + largestSeqNum, err = d.readUvarint() + if err != nil { + return err + } + } + var markedForCompaction bool + var creationTime uint64 + virtualState := struct { + virtual bool + backingFileNum uint64 + }{} + if tag == tagNewFile4 || tag == tagNewFile5 { + for { + customTag, err := d.readUvarint() + if err != nil { + return err + } + if customTag == customTagTerminate { + break + } else if customTag == customTagVirtual { + virtualState.virtual = true + n, err := d.readUvarint() + if err != nil { + return err + } + virtualState.backingFileNum = n + continue + } + + field, err := d.readBytes() + if err != nil { + return err + } + switch customTag { + case customTagNeedsCompaction: + if len(field) != 1 { + return base.CorruptionErrorf("new-file4: need-compaction field wrong size") + } + markedForCompaction = (field[0] == 1) + + case customTagCreationTime: + var n int + creationTime, n = binary.Uvarint(field) + if n != len(field) { + return base.CorruptionErrorf("new-file4: invalid file creation time") + } + + case customTagPathID: + return base.CorruptionErrorf("new-file4: path-id field not supported") + + default: + if (customTag & customTagNonSafeIgnoreMask) != 0 { + return base.CorruptionErrorf("new-file4: custom field not supported: %d", customTag) + } + } + } + } + m := &FileMetadata{ + FileNum: fileNum, + Size: size, + CreationTime: int64(creationTime), + SmallestSeqNum: smallestSeqNum, + LargestSeqNum: largestSeqNum, + MarkedForCompaction: markedForCompaction, + Virtual: virtualState.virtual, + } + if tag != tagNewFile5 { // no range keys present + m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey) + m.LargestPointKey = 
base.DecodeInternalKey(largestPointKey) + m.HasPointKeys = true + m.Smallest, m.Largest = m.SmallestPointKey, m.LargestPointKey + m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey + } else { // range keys present + // Set point key bounds, if parsed. + if parsedPointBounds { + m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey) + m.LargestPointKey = base.DecodeInternalKey(largestPointKey) + m.HasPointKeys = true + } + // Set range key bounds. + m.SmallestRangeKey = base.DecodeInternalKey(smallestRangeKey) + m.LargestRangeKey = base.DecodeInternalKey(largestRangeKey) + m.HasRangeKeys = true + // Set overall bounds (by default assume range keys). + m.Smallest, m.Largest = m.SmallestRangeKey, m.LargestRangeKey + m.boundTypeSmallest, m.boundTypeLargest = boundTypeRangeKey, boundTypeRangeKey + if boundsMarker&maskSmallest == maskSmallest { + m.Smallest = m.SmallestPointKey + m.boundTypeSmallest = boundTypePointKey + } + if boundsMarker&maskLargest == maskLargest { + m.Largest = m.LargestPointKey + m.boundTypeLargest = boundTypePointKey + } + } + m.boundsSet = true + if !virtualState.virtual { + m.InitPhysicalBacking() + } + + nfe := NewFileEntry{ + Level: level, + Meta: m, + } + if virtualState.virtual { + nfe.BackingFileNum = base.FileNum(virtualState.backingFileNum).DiskFileNum() + } + v.NewFiles = append(v.NewFiles, nfe) + + case tagPrevLogNumber: + n, err := d.readUvarint() + if err != nil { + return err + } + v.ObsoletePrevLogNum = n + + case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily: + return base.CorruptionErrorf("column families are not supported") + + default: + return errCorruptManifest + } + } + return nil +} + +func (v *VersionEdit) string(verbose bool, fmtKey base.FormatKey) string { + var buf bytes.Buffer + if v.ComparerName != "" { + fmt.Fprintf(&buf, " comparer: %s", v.ComparerName) + } + if v.MinUnflushedLogNum != 0 { + fmt.Fprintf(&buf, " log-num: %d\n", v.MinUnflushedLogNum) + } 
+ if v.ObsoletePrevLogNum != 0 { + fmt.Fprintf(&buf, " prev-log-num: %d\n", v.ObsoletePrevLogNum) + } + if v.NextFileNum != 0 { + fmt.Fprintf(&buf, " next-file-num: %d\n", v.NextFileNum) + } + if v.LastSeqNum != 0 { + fmt.Fprintf(&buf, " last-seq-num: %d\n", v.LastSeqNum) + } + entries := make([]DeletedFileEntry, 0, len(v.DeletedFiles)) + for df := range v.DeletedFiles { + entries = append(entries, df) + } + slices.SortFunc(entries, func(a, b DeletedFileEntry) int { + if v := stdcmp.Compare(a.Level, b.Level); v != 0 { + return v + } + return stdcmp.Compare(a.FileNum, b.FileNum) + }) + for _, df := range entries { + fmt.Fprintf(&buf, " deleted: L%d %s\n", df.Level, df.FileNum) + } + for _, nf := range v.NewFiles { + fmt.Fprintf(&buf, " added: L%d", nf.Level) + if verbose { + fmt.Fprintf(&buf, " %s", nf.Meta.DebugString(fmtKey, true /* verbose */)) + } else { + fmt.Fprintf(&buf, " %s", nf.Meta.String()) + } + if nf.Meta.CreationTime != 0 { + fmt.Fprintf(&buf, " (%s)", + time.Unix(nf.Meta.CreationTime, 0).UTC().Format(time.RFC3339)) + } + fmt.Fprintln(&buf) + } + return buf.String() +} + +// DebugString is a more verbose version of String(). Use this in tests. +func (v *VersionEdit) DebugString(fmtKey base.FormatKey) string { + return v.string(true /* verbose */, fmtKey) +} + +// String implements fmt.Stringer for a VersionEdit. +func (v *VersionEdit) String() string { + return v.string(false /* verbose */, base.DefaultFormatter) +} + +// Encode encodes an edit to the specified writer. 
+func (v *VersionEdit) Encode(w io.Writer) error { + e := versionEditEncoder{new(bytes.Buffer)} + + if v.ComparerName != "" { + e.writeUvarint(tagComparator) + e.writeString(v.ComparerName) + } + if v.MinUnflushedLogNum != 0 { + e.writeUvarint(tagLogNumber) + e.writeUvarint(uint64(v.MinUnflushedLogNum)) + } + if v.ObsoletePrevLogNum != 0 { + e.writeUvarint(tagPrevLogNumber) + e.writeUvarint(v.ObsoletePrevLogNum) + } + if v.NextFileNum != 0 { + e.writeUvarint(tagNextFileNumber) + e.writeUvarint(uint64(v.NextFileNum)) + } + for _, dfn := range v.RemovedBackingTables { + e.writeUvarint(tagRemovedBackingTable) + e.writeUvarint(uint64(dfn.FileNum())) + } + for _, fileBacking := range v.CreatedBackingTables { + e.writeUvarint(tagCreatedBackingTable) + e.writeUvarint(uint64(fileBacking.DiskFileNum.FileNum())) + e.writeUvarint(fileBacking.Size) + } + // RocksDB requires LastSeqNum to be encoded for the first MANIFEST entry, + // even though its value is zero. We detect this by encoding LastSeqNum when + // ComparerName is set. + if v.LastSeqNum != 0 || v.ComparerName != "" { + e.writeUvarint(tagLastSequence) + e.writeUvarint(v.LastSeqNum) + } + for x := range v.DeletedFiles { + e.writeUvarint(tagDeletedFile) + e.writeUvarint(uint64(x.Level)) + e.writeUvarint(uint64(x.FileNum)) + } + for _, x := range v.NewFiles { + customFields := x.Meta.MarkedForCompaction || x.Meta.CreationTime != 0 || x.Meta.Virtual + var tag uint64 + switch { + case x.Meta.HasRangeKeys: + tag = tagNewFile5 + case customFields: + tag = tagNewFile4 + default: + tag = tagNewFile2 + } + e.writeUvarint(tag) + e.writeUvarint(uint64(x.Level)) + e.writeUvarint(uint64(x.Meta.FileNum)) + e.writeUvarint(x.Meta.Size) + if !x.Meta.HasRangeKeys { + // If we have no range keys, preserve the original format and write the + // smallest and largest point keys. 
+ e.writeKey(x.Meta.SmallestPointKey) + e.writeKey(x.Meta.LargestPointKey) + } else { + // When range keys are present, we first write a marker byte that + // indicates if the table also contains point keys, in addition to how the + // overall bounds for the table should be reconstructed. This byte is + // followed by the keys themselves. + b, err := x.Meta.boundsMarker() + if err != nil { + return err + } + if err = e.WriteByte(b); err != nil { + return err + } + // Write point key bounds (if present). + if x.Meta.HasPointKeys { + e.writeKey(x.Meta.SmallestPointKey) + e.writeKey(x.Meta.LargestPointKey) + } + // Write range key bounds. + e.writeKey(x.Meta.SmallestRangeKey) + e.writeKey(x.Meta.LargestRangeKey) + } + e.writeUvarint(x.Meta.SmallestSeqNum) + e.writeUvarint(x.Meta.LargestSeqNum) + if customFields { + if x.Meta.CreationTime != 0 { + e.writeUvarint(customTagCreationTime) + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], uint64(x.Meta.CreationTime)) + e.writeBytes(buf[:n]) + } + if x.Meta.MarkedForCompaction { + e.writeUvarint(customTagNeedsCompaction) + e.writeBytes([]byte{1}) + } + if x.Meta.Virtual { + e.writeUvarint(customTagVirtual) + e.writeUvarint(uint64(x.Meta.FileBacking.DiskFileNum.FileNum())) + } + e.writeUvarint(customTagTerminate) + } + } + _, err := w.Write(e.Bytes()) + return err +} + +// versionEditDecoder should be used to decode version edits. 
+type versionEditDecoder struct { + byteReader +} + +func (d versionEditDecoder) readBytes() ([]byte, error) { + n, err := d.readUvarint() + if err != nil { + return nil, err + } + s := make([]byte, n) + _, err = io.ReadFull(d, s) + if err != nil { + if err == io.ErrUnexpectedEOF { + return nil, errCorruptManifest + } + return nil, err + } + return s, nil +} + +func (d versionEditDecoder) readLevel() (int, error) { + u, err := d.readUvarint() + if err != nil { + return 0, err + } + if u >= NumLevels { + return 0, errCorruptManifest + } + return int(u), nil +} + +func (d versionEditDecoder) readFileNum() (base.FileNum, error) { + u, err := d.readUvarint() + if err != nil { + return 0, err + } + return base.FileNum(u), nil +} + +func (d versionEditDecoder) readUvarint() (uint64, error) { + u, err := binary.ReadUvarint(d) + if err != nil { + if err == io.EOF { + return 0, errCorruptManifest + } + return 0, err + } + return u, nil +} + +type versionEditEncoder struct { + *bytes.Buffer +} + +func (e versionEditEncoder) writeBytes(p []byte) { + e.writeUvarint(uint64(len(p))) + e.Write(p) +} + +func (e versionEditEncoder) writeKey(k InternalKey) { + e.writeUvarint(uint64(k.Size())) + e.Write(k.UserKey) + buf := k.EncodeTrailer() + e.Write(buf[:]) +} + +func (e versionEditEncoder) writeString(s string) { + e.writeUvarint(uint64(len(s))) + e.WriteString(s) +} + +func (e versionEditEncoder) writeUvarint(u uint64) { + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], u) + e.Write(buf[:n]) +} + +// BulkVersionEdit summarizes the files added and deleted from a set of version +// edits. +// +// INVARIANTS: +// No file can be added to a level more than once. This is true globally, and +// also true for all of the calls to Accumulate for a single bulk version edit. +// +// No file can be removed from a level more than once. This is true globally, +// and also true for all of the calls to Accumulate for a single bulk version +// edit. 
+// +// A file must not be added and removed from a given level in the same version +// edit. +// +// A file that is being removed from a level must have been added to that level +// before (in a prior version edit). Note that a given file can be deleted from +// a level and added to another level in a single version edit +type BulkVersionEdit struct { + Added [NumLevels]map[base.FileNum]*FileMetadata + Deleted [NumLevels]map[base.FileNum]*FileMetadata + + // AddedFileBacking is a map to support lookup so that we can populate the + // FileBacking of virtual sstables during manifest replay. + AddedFileBacking map[base.DiskFileNum]*FileBacking + RemovedFileBacking []base.DiskFileNum + + // AddedByFileNum maps file number to file metadata for all added files + // from accumulated version edits. AddedByFileNum is only populated if set + // to non-nil by a caller. It must be set to non-nil when replaying + // version edits read from a MANIFEST (as opposed to VersionEdits + // constructed in-memory). While replaying a MANIFEST file, + // VersionEdit.DeletedFiles map entries have nil values, because the + // on-disk deletion record encodes only the file number. Accumulate + // uses AddedByFileNum to correctly populate the BulkVersionEdit's Deleted + // field with non-nil *FileMetadata. + AddedByFileNum map[base.FileNum]*FileMetadata + + // MarkedForCompactionCountDiff holds the aggregated count of files + // marked for compaction added or removed. + MarkedForCompactionCountDiff int +} + +// Accumulate adds the file addition and deletions in the specified version +// edit to the bulk edit's internal state. +// +// INVARIANTS: +// If a file is added to a given level in a call to Accumulate and then removed +// from that level in a subsequent call, the file will not be present in the +// resulting BulkVersionEdit.Deleted for that level. 
+// +// After accumulation of version edits, the bulk version edit may have +// information about a file which has been deleted from a level, but it may +// not have information about the same file added to the same level. The add +// could've occurred as part of a previous bulk version edit. In this case, +// the deleted file must be present in BulkVersionEdit.Deleted, at the end +// of the accumulation, because we need to decrease the refcount of the +// deleted file in Apply. +func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error { + for df, m := range ve.DeletedFiles { + dmap := b.Deleted[df.Level] + if dmap == nil { + dmap = make(map[base.FileNum]*FileMetadata) + b.Deleted[df.Level] = dmap + } + + if m == nil { + // m is nil only when replaying a MANIFEST. + if b.AddedByFileNum == nil { + return errors.Errorf("deleted file L%d.%s's metadata is absent and bve.AddedByFileNum is nil", df.Level, df.FileNum) + } + m = b.AddedByFileNum[df.FileNum] + if m == nil { + return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", df.Level, df.FileNum) + } + } + if m.MarkedForCompaction { + b.MarkedForCompactionCountDiff-- + } + if _, ok := b.Added[df.Level][df.FileNum]; !ok { + dmap[df.FileNum] = m + } else { + // Present in b.Added for the same level. + delete(b.Added[df.Level], df.FileNum) + } + } + + // Generate state for Added backing files. Note that these must be generated + // before we loop through the NewFiles, because we need to populate the + // FileBackings which might be used by the NewFiles loop. + if b.AddedFileBacking == nil { + b.AddedFileBacking = make(map[base.DiskFileNum]*FileBacking) + } + for _, fb := range ve.CreatedBackingTables { + if _, ok := b.AddedFileBacking[fb.DiskFileNum]; ok { + // There is already a FileBacking associated with fb.DiskFileNum. + // This should never happen. There must always be only one FileBacking + // associated with a backing sstable. 
+ panic(fmt.Sprintf("pebble: duplicate file backing %s", fb.DiskFileNum.String())) + } + b.AddedFileBacking[fb.DiskFileNum] = fb + } + + for _, nf := range ve.NewFiles { + // A new file should not have been deleted in this or a preceding + // VersionEdit at the same level (though files can move across levels). + if dmap := b.Deleted[nf.Level]; dmap != nil { + if _, ok := dmap[nf.Meta.FileNum]; ok { + return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", nf.Level, nf.Meta.FileNum) + } + } + if nf.Meta.Virtual && nf.Meta.FileBacking == nil { + // FileBacking for a virtual sstable must only be nil if we're performing + // manifest replay. + nf.Meta.FileBacking = b.AddedFileBacking[nf.BackingFileNum] + if nf.Meta.FileBacking == nil { + return errors.Errorf("FileBacking for virtual sstable must not be nil") + } + } else if nf.Meta.FileBacking == nil { + return errors.Errorf("Added file L%d.%s's has no FileBacking", nf.Level, nf.Meta.FileNum) + } + + if b.Added[nf.Level] == nil { + b.Added[nf.Level] = make(map[base.FileNum]*FileMetadata) + } + b.Added[nf.Level][nf.Meta.FileNum] = nf.Meta + if b.AddedByFileNum != nil { + b.AddedByFileNum[nf.Meta.FileNum] = nf.Meta + } + if nf.Meta.MarkedForCompaction { + b.MarkedForCompactionCountDiff++ + } + } + + // Since a file can be removed from backing files in exactly one version + // edit it is safe to just append without any de-duplication. + b.RemovedFileBacking = append(b.RemovedFileBacking, ve.RemovedBackingTables...) + + return nil +} + +// AccumulateIncompleteAndApplySingleVE should be called if a single version edit +// is to be applied to the provided curr Version and if the caller needs to +// update the versionSet.zombieTables map. This function exists separately from +// BulkVersionEdit.Apply because it is easier to reason about properties +// regarding BulkVersionedit.Accumulate/Apply and zombie table generation, if we +// know that exactly one version edit is being accumulated. 
+// +// Note that the version edit passed into this function may be incomplete +// because compactions don't have the ref counting information necessary to +// populate VersionEdit.RemovedBackingTables. This function will complete such a +// version edit by populating RemovedBackingTables. +// +// Invariant: Any file being deleted through ve must belong to the curr Version. +// We can't have a delete for some arbitrary file which does not exist in curr. +func AccumulateIncompleteAndApplySingleVE( + ve *VersionEdit, + curr *Version, + cmp Compare, + formatKey base.FormatKey, + flushSplitBytes int64, + readCompactionRate int64, + backingStateMap map[base.DiskFileNum]*FileBacking, + addBackingFunc func(*FileBacking), + removeBackingFunc func(base.DiskFileNum), + orderingInvariants OrderingInvariants, +) (_ *Version, zombies map[base.DiskFileNum]uint64, _ error) { + if len(ve.RemovedBackingTables) != 0 { + panic("pebble: invalid incomplete version edit") + } + var b BulkVersionEdit + err := b.Accumulate(ve) + if err != nil { + return nil, nil, err + } + zombies = make(map[base.DiskFileNum]uint64) + v, err := b.Apply( + curr, cmp, formatKey, flushSplitBytes, readCompactionRate, zombies, orderingInvariants, + ) + if err != nil { + return nil, nil, err + } + + for _, s := range b.AddedFileBacking { + addBackingFunc(s) + } + + for fileNum := range zombies { + if _, ok := backingStateMap[fileNum]; ok { + // This table was backing some virtual sstable in the latest version, + // but is now a zombie. We add RemovedBackingTables entries for + // these, before the version edit is written to disk. + ve.RemovedBackingTables = append( + ve.RemovedBackingTables, fileNum, + ) + removeBackingFunc(fileNum) + } + } + return v, zombies, nil +} + +// Apply applies the delta b to the current version to produce a new +// version. The new version is consistent with respect to the comparer cmp. +// +// curr may be nil, which is equivalent to a pointer to a zero version. 
+// +// On success, if a non-nil zombies map is provided to Apply, the map is updated +// with file numbers and files sizes of deleted files. These files are +// considered zombies because they are no longer referenced by the returned +// Version, but cannot be deleted from disk as they are still in use by the +// incoming Version. +func (b *BulkVersionEdit) Apply( + curr *Version, + cmp Compare, + formatKey base.FormatKey, + flushSplitBytes int64, + readCompactionRate int64, + zombies map[base.DiskFileNum]uint64, + orderingInvariants OrderingInvariants, +) (*Version, error) { + addZombie := func(state *FileBacking) { + if zombies != nil { + zombies[state.DiskFileNum] = state.Size + } + } + removeZombie := func(state *FileBacking) { + if zombies != nil { + delete(zombies, state.DiskFileNum) + } + } + + v := new(Version) + + // Adjust the count of files marked for compaction. + if curr != nil { + v.Stats.MarkedForCompaction = curr.Stats.MarkedForCompaction + } + v.Stats.MarkedForCompaction += b.MarkedForCompactionCountDiff + if v.Stats.MarkedForCompaction < 0 { + return nil, base.CorruptionErrorf("pebble: version marked for compaction count negative") + } + + for level := range v.Levels { + if curr == nil || curr.Levels[level].tree.root == nil { + v.Levels[level] = makeLevelMetadata(cmp, level, nil /* files */) + } else { + v.Levels[level] = curr.Levels[level].clone() + } + if curr == nil || curr.RangeKeyLevels[level].tree.root == nil { + v.RangeKeyLevels[level] = makeLevelMetadata(cmp, level, nil /* files */) + } else { + v.RangeKeyLevels[level] = curr.RangeKeyLevels[level].clone() + } + + if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 { + // There are no edits on this level. + if level == 0 { + // Initialize L0Sublevels. 
+ if curr == nil || curr.L0Sublevels == nil { + if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil { + return nil, errors.Wrap(err, "pebble: internal error") + } + } else { + v.L0Sublevels = curr.L0Sublevels + v.L0SublevelFiles = v.L0Sublevels.Levels + } + } + continue + } + + // Some edits on this level. + lm := &v.Levels[level] + lmRange := &v.RangeKeyLevels[level] + + addedFilesMap := b.Added[level] + deletedFilesMap := b.Deleted[level] + if n := v.Levels[level].Len() + len(addedFilesMap); n == 0 { + return nil, base.CorruptionErrorf( + "pebble: internal error: No current or added files but have deleted files: %d", + errors.Safe(len(deletedFilesMap))) + } + + // NB: addedFilesMap may be empty. If a file is present in addedFilesMap + // for a level, it won't be present in deletedFilesMap for the same + // level. + + for _, f := range deletedFilesMap { + if obsolete := v.Levels[level].remove(f); obsolete { + // Deleting a file from the B-Tree may decrement its + // reference count. However, because we cloned the + // previous level's B-Tree, this should never result in a + // file's reference count dropping to zero. + err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during B-Tree removal", level, f.FileNum) + return nil, err + } + if f.HasRangeKeys { + if obsolete := v.RangeKeyLevels[level].remove(f); obsolete { + // Deleting a file from the B-Tree may decrement its + // reference count. However, because we cloned the + // previous level's B-Tree, this should never result in a + // file's reference count dropping to zero. + err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during range-key B-Tree removal", level, f.FileNum) + return nil, err + } + } + + // Note that a backing sst will only become a zombie if the + // references to it in the latest version is 0. 
We will remove the + // backing sst from the zombie list in the next loop if one of the + // addedFiles in any of the levels is referencing the backing sst. + // This is possible if a physical sstable is virtualized, or if it + // is moved. + latestRefCount := f.LatestRefs() + if latestRefCount <= 0 { + // If a file is present in deletedFilesMap for a level, then it + // must have already been added to the level previously, which + // means that its latest ref count cannot be 0. + err := errors.Errorf("pebble: internal error: incorrect latestRefs reference counting for file", f.FileNum) + return nil, err + } else if f.LatestUnref() == 0 { + addZombie(f.FileBacking) + } + } + + addedFiles := make([]*FileMetadata, 0, len(addedFilesMap)) + for _, f := range addedFilesMap { + addedFiles = append(addedFiles, f) + } + // Sort addedFiles by file number. This isn't necessary, but tests which + // replay invalid manifests check the error output, and the error output + // depends on the order in which files are added to the btree. + slices.SortFunc(addedFiles, func(a, b *FileMetadata) int { + return stdcmp.Compare(a.FileNum, b.FileNum) + }) + + var sm, la *FileMetadata + for _, f := range addedFiles { + // NB: allowedSeeks is used for read triggered compactions. It is set using + // Options.Experimental.ReadCompactionRate which defaults to 32KB. + var allowedSeeks int64 + if readCompactionRate != 0 { + allowedSeeks = int64(f.Size) / readCompactionRate + } + if allowedSeeks < 100 { + allowedSeeks = 100 + } + f.AllowedSeeks.Store(allowedSeeks) + f.InitAllowedSeeks = allowedSeeks + + err := lm.insert(f) + // We're adding this file to the new version, so increment the + // latest refs count. 
+ f.LatestRef() + if err != nil { + return nil, errors.Wrap(err, "pebble") + } + if f.HasRangeKeys { + err = lmRange.insert(f) + if err != nil { + return nil, errors.Wrap(err, "pebble") + } + } + removeZombie(f.FileBacking) + // Track the keys with the smallest and largest keys, so that we can + // check consistency of the modified span. + if sm == nil || base.InternalCompare(cmp, sm.Smallest, f.Smallest) > 0 { + sm = f + } + if la == nil || base.InternalCompare(cmp, la.Largest, f.Largest) < 0 { + la = f + } + } + + if level == 0 { + if curr != nil && curr.L0Sublevels != nil && len(deletedFilesMap) == 0 { + // Flushes and ingestions that do not delete any L0 files do not require + // a regeneration of L0Sublevels from scratch. We can instead generate + // it incrementally. + var err error + // AddL0Files requires addedFiles to be sorted in seqnum order. + SortBySeqNum(addedFiles) + v.L0Sublevels, err = curr.L0Sublevels.AddL0Files(addedFiles, flushSplitBytes, &v.Levels[0]) + if errors.Is(err, errInvalidL0SublevelsOpt) { + err = v.InitL0Sublevels(cmp, formatKey, flushSplitBytes) + } else if invariants.Enabled && err == nil { + copyOfSublevels, err := NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes) + if err != nil { + panic(fmt.Sprintf("error when regenerating sublevels: %s", err)) + } + s1 := describeSublevels(base.DefaultFormatter, false /* verbose */, copyOfSublevels.Levels) + s2 := describeSublevels(base.DefaultFormatter, false /* verbose */, v.L0Sublevels.Levels) + if s1 != s2 { + panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2)) + } + } + if err != nil { + return nil, errors.Wrap(err, "pebble: internal error") + } + v.L0SublevelFiles = v.L0Sublevels.Levels + } else if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil { + return nil, errors.Wrap(err, "pebble: internal error") + } + if err := CheckOrdering(cmp, formatKey, Level(0), v.Levels[level].Iter(), 
orderingInvariants); err != nil { + return nil, errors.Wrap(err, "pebble: internal error") + } + continue + } + + // Check consistency of the level in the vicinity of our edits. + if sm != nil && la != nil { + overlap := overlaps(v.Levels[level].Iter(), cmp, sm.Smallest.UserKey, + la.Largest.UserKey, la.Largest.IsExclusiveSentinel()) + // overlap contains all of the added files. We want to ensure that + // the added files are consistent with neighboring existing files + // too, so reslice the overlap to pull in a neighbor on each side. + check := overlap.Reslice(func(start, end *LevelIterator) { + if m := start.Prev(); m == nil { + start.Next() + } + if m := end.Next(); m == nil { + end.Prev() + } + }) + if err := CheckOrdering(cmp, formatKey, Level(level), check.Iter(), orderingInvariants); err != nil { + return nil, errors.Wrap(err, "pebble: internal error") + } + } + } + return v, nil +} diff --git a/pebble/internal/manifest/version_edit_test.go b/pebble/internal/manifest/version_edit_test.go new file mode 100644 index 0000000..6d09153 --- /dev/null +++ b/pebble/internal/manifest/version_edit_test.go @@ -0,0 +1,545 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package manifest + +import ( + "bytes" + "fmt" + "io" + "os" + "reflect" + "slices" + "strconv" + "strings" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/record" + "github.com/kr/pretty" + "github.com/stretchr/testify/require" +) + +func checkRoundTrip(e0 VersionEdit) error { + var e1 VersionEdit + buf := new(bytes.Buffer) + if err := e0.Encode(buf); err != nil { + return errors.Wrap(err, "encode") + } + if err := e1.Decode(buf); err != nil { + return errors.Wrap(err, "decode") + } + if diff := pretty.Diff(e0, e1); diff != nil { + return errors.Errorf("%s", strings.Join(diff, "\n")) + } + return nil +} + +// Version edits with virtual sstables will not be the same after a round trip +// as the Decode function will not set the FileBacking for a virtual sstable. +// We test round trip + bve accumulation here, after which the virtual sstable +// FileBacking should be set. 
+func TestVERoundTripAndAccumulate(t *testing.T) { + cmp := base.DefaultComparer.Compare + m1 := (&FileMetadata{ + FileNum: 810, + Size: 8090, + CreationTime: 809060, + SmallestSeqNum: 9, + LargestSeqNum: 11, + }).ExtendPointKeyBounds( + cmp, + base.MakeInternalKey([]byte("a"), 0, base.InternalKeyKindSet), + base.MakeInternalKey([]byte("m"), 0, base.InternalKeyKindSet), + ).ExtendRangeKeyBounds( + cmp, + base.MakeInternalKey([]byte("l"), 0, base.InternalKeyKindRangeKeySet), + base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeySet, []byte("z")), + ) + m1.InitPhysicalBacking() + + m2 := (&FileMetadata{ + FileNum: 812, + Size: 8090, + CreationTime: 809060, + SmallestSeqNum: 9, + LargestSeqNum: 11, + Virtual: true, + FileBacking: m1.FileBacking, + }).ExtendPointKeyBounds( + cmp, + base.MakeInternalKey([]byte("a"), 0, base.InternalKeyKindSet), + base.MakeInternalKey([]byte("c"), 0, base.InternalKeyKindSet), + ) + + ve1 := VersionEdit{ + ComparerName: "11", + MinUnflushedLogNum: 22, + ObsoletePrevLogNum: 33, + NextFileNum: 44, + LastSeqNum: 55, + CreatedBackingTables: []*FileBacking{m1.FileBacking}, + NewFiles: []NewFileEntry{ + { + Level: 4, + Meta: m2, + // Only set for the test. + BackingFileNum: m2.FileBacking.DiskFileNum, + }, + }, + } + var err error + buf := new(bytes.Buffer) + if err = ve1.Encode(buf); err != nil { + t.Error(err) + } + var ve2 VersionEdit + if err = ve2.Decode(buf); err != nil { + t.Error(err) + } + // Perform accumulation to set the FileBacking on the files in the Decoded + // version edit. 
+ var bve BulkVersionEdit + require.NoError(t, bve.Accumulate(&ve2)) + if diff := pretty.Diff(ve1, ve2); diff != nil { + t.Error(errors.Errorf("%s", strings.Join(diff, "\n"))) + } +} + +func TestVersionEditRoundTrip(t *testing.T) { + cmp := base.DefaultComparer.Compare + m1 := (&FileMetadata{ + FileNum: 805, + Size: 8050, + CreationTime: 805030, + }).ExtendPointKeyBounds( + cmp, + base.DecodeInternalKey([]byte("abc\x00\x01\x02\x03\x04\x05\x06\x07")), + base.DecodeInternalKey([]byte("xyz\x01\xff\xfe\xfd\xfc\xfb\xfa\xf9")), + ) + m1.InitPhysicalBacking() + + m2 := (&FileMetadata{ + FileNum: 806, + Size: 8060, + CreationTime: 806040, + SmallestSeqNum: 3, + LargestSeqNum: 5, + MarkedForCompaction: true, + }).ExtendPointKeyBounds( + cmp, + base.DecodeInternalKey([]byte("A\x00\x01\x02\x03\x04\x05\x06\x07")), + base.DecodeInternalKey([]byte("Z\x01\xff\xfe\xfd\xfc\xfb\xfa\xf9")), + ) + m2.InitPhysicalBacking() + + m3 := (&FileMetadata{ + FileNum: 807, + Size: 8070, + CreationTime: 807050, + }).ExtendRangeKeyBounds( + cmp, + base.MakeInternalKey([]byte("aaa"), 0, base.InternalKeyKindRangeKeySet), + base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeySet, []byte("zzz")), + ) + m3.InitPhysicalBacking() + + m4 := (&FileMetadata{ + FileNum: 809, + Size: 8090, + CreationTime: 809060, + SmallestSeqNum: 9, + LargestSeqNum: 11, + }).ExtendPointKeyBounds( + cmp, + base.MakeInternalKey([]byte("a"), 0, base.InternalKeyKindSet), + base.MakeInternalKey([]byte("m"), 0, base.InternalKeyKindSet), + ).ExtendRangeKeyBounds( + cmp, + base.MakeInternalKey([]byte("l"), 0, base.InternalKeyKindRangeKeySet), + base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeySet, []byte("z")), + ) + m4.InitPhysicalBacking() + + m5 := (&FileMetadata{ + FileNum: 810, + Size: 8090, + CreationTime: 809060, + SmallestSeqNum: 9, + LargestSeqNum: 11, + }).ExtendPointKeyBounds( + cmp, + base.MakeInternalKey([]byte("a"), 0, base.InternalKeyKindSet), + base.MakeInternalKey([]byte("m"), 0, 
base.InternalKeyKindSet), + ).ExtendRangeKeyBounds( + cmp, + base.MakeInternalKey([]byte("l"), 0, base.InternalKeyKindRangeKeySet), + base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeySet, []byte("z")), + ) + m5.InitPhysicalBacking() + + m6 := (&FileMetadata{ + FileNum: 811, + Size: 8090, + CreationTime: 809060, + SmallestSeqNum: 9, + LargestSeqNum: 11, + }).ExtendPointKeyBounds( + cmp, + base.MakeInternalKey([]byte("a"), 0, base.InternalKeyKindSet), + base.MakeInternalKey([]byte("m"), 0, base.InternalKeyKindSet), + ).ExtendRangeKeyBounds( + cmp, + base.MakeInternalKey([]byte("l"), 0, base.InternalKeyKindRangeKeySet), + base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeySet, []byte("z")), + ) + m6.InitPhysicalBacking() + + testCases := []VersionEdit{ + // An empty version edit. + {}, + // A complete version edit. + { + ComparerName: "11", + MinUnflushedLogNum: 22, + ObsoletePrevLogNum: 33, + NextFileNum: 44, + LastSeqNum: 55, + RemovedBackingTables: []base.DiskFileNum{ + base.FileNum(10).DiskFileNum(), base.FileNum(11).DiskFileNum(), + }, + CreatedBackingTables: []*FileBacking{m5.FileBacking, m6.FileBacking}, + DeletedFiles: map[DeletedFileEntry]*FileMetadata{ + { + Level: 3, + FileNum: 703, + }: nil, + { + Level: 4, + FileNum: 704, + }: nil, + }, + NewFiles: []NewFileEntry{ + { + Level: 4, + Meta: m1, + }, + { + Level: 5, + Meta: m2, + }, + { + Level: 6, + Meta: m3, + }, + { + Level: 6, + Meta: m4, + }, + }, + }, + } + for _, tc := range testCases { + if err := checkRoundTrip(tc); err != nil { + t.Error(err) + } + } +} + +func TestVersionEditDecode(t *testing.T) { + // TODO(radu): these should be datadriven tests that output the encoded and + // decoded edits. 
+ cmp := base.DefaultComparer.Compare + m := (&FileMetadata{ + FileNum: 4, + Size: 709, + SmallestSeqNum: 12, + LargestSeqNum: 14, + CreationTime: 1701712644, + }).ExtendPointKeyBounds( + cmp, + base.MakeInternalKey([]byte("bar"), 14, base.InternalKeyKindDelete), + base.MakeInternalKey([]byte("foo"), 13, base.InternalKeyKindSet), + ) + m.InitPhysicalBacking() + + testCases := []struct { + filename string + encodedEdits []string + edits []VersionEdit + }{ + // db-stage-1 and db-stage-2 have the same manifest. + { + filename: "db-stage-1/MANIFEST-000001", + encodedEdits: []string{ + "\x01\x1aleveldb.BytewiseComparator\x03\x02\x04\x00", + "\x02\x02\x03\x03\x04\t", + }, + edits: []VersionEdit{ + { + ComparerName: "leveldb.BytewiseComparator", + NextFileNum: 2, + }, + { + MinUnflushedLogNum: 0x2, + NextFileNum: 0x3, + LastSeqNum: 0x9, + }, + }, + }, + // db-stage-3 and db-stage-4 have the same manifest. + { + filename: "db-stage-3/MANIFEST-000006", + encodedEdits: []string{ + "\x01\x1aleveldb.BytewiseComparator\x02\x02\x03\a\x04\x00", + "\x02\x05\x03\x06\x04\x0eg\x00\x04\xc5\x05\vbar\x00\x0e\x00\x00\x00\x00\x00\x00\vfoo\x01\r\x00\x00\x00\x00\x00\x00\f\x0e\x06\x05\x84\xa6\xb8\xab\x06\x01", + }, + edits: []VersionEdit{ + { + ComparerName: "leveldb.BytewiseComparator", + MinUnflushedLogNum: 0x2, + NextFileNum: 0x7, + }, + { + MinUnflushedLogNum: 0x5, + NextFileNum: 0x6, + LastSeqNum: 0xe, + NewFiles: []NewFileEntry{ + { + Level: 0, + Meta: m, + }, + }, + }, + }, + }, + } + + for _, tc := range testCases { + t.Run("", func(t *testing.T) { + f, err := os.Open("../../testdata/" + tc.filename) + if err != nil { + t.Fatalf("filename=%q: open error: %v", tc.filename, err) + } + defer f.Close() + i, r := 0, record.NewReader(f, 0 /* logNum */) + for { + rr, err := r.Next() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("filename=%q i=%d: record reader error: %v", tc.filename, i, err) + } + if i >= len(tc.edits) { + t.Fatalf("filename=%q i=%d: too many version 
edits", tc.filename, i+1) + } + + encodedEdit, err := io.ReadAll(rr) + if err != nil { + t.Fatalf("filename=%q i=%d: read error: %v", tc.filename, i, err) + } + if s := string(encodedEdit); s != tc.encodedEdits[i] { + t.Fatalf("filename=%q i=%d: got encoded %q, want %q", tc.filename, i, s, tc.encodedEdits[i]) + } + + var edit VersionEdit + err = edit.Decode(bytes.NewReader(encodedEdit)) + if err != nil { + t.Fatalf("filename=%q i=%d: decode error: %v", tc.filename, i, err) + } + if !reflect.DeepEqual(edit, tc.edits[i]) { + t.Fatalf("filename=%q i=%d: decode\n\tgot %#v\n\twant %#v\n%s", tc.filename, i, edit, tc.edits[i], + strings.Join(pretty.Diff(edit, tc.edits[i]), "\n")) + } + if err := checkRoundTrip(edit); err != nil { + t.Fatalf("filename=%q i=%d: round trip: %v", tc.filename, i, err) + } + + i++ + } + if i != len(tc.edits) { + t.Fatalf("filename=%q: got %d edits, want %d", tc.filename, i, len(tc.edits)) + } + }) + } +} + +func TestVersionEditEncodeLastSeqNum(t *testing.T) { + testCases := []struct { + edit VersionEdit + encoded string + }{ + // If ComparerName is unset, LastSeqNum is only encoded if non-zero. + {VersionEdit{LastSeqNum: 0}, ""}, + {VersionEdit{LastSeqNum: 1}, "\x04\x01"}, + // For compatibility with RocksDB, if ComparerName is set we always encode + // LastSeqNum. + {VersionEdit{ComparerName: "foo", LastSeqNum: 0}, "\x01\x03\x66\x6f\x6f\x04\x00"}, + {VersionEdit{ComparerName: "foo", LastSeqNum: 1}, "\x01\x03\x66\x6f\x6f\x04\x01"}, + } + for _, c := range testCases { + t.Run("", func(t *testing.T) { + var buf bytes.Buffer + require.NoError(t, c.edit.Encode(&buf)) + if result := buf.String(); c.encoded != result { + t.Fatalf("expected %x, but found %x", c.encoded, result) + } + + if c.edit.ComparerName != "" { + // Manually decode the version edit so that we can verify the contents + // even if the LastSeqNum decodes to 0. + d := versionEditDecoder{strings.NewReader(c.encoded)} + + // Decode ComparerName. 
+ tag, err := d.readUvarint() + require.NoError(t, err) + if tag != tagComparator { + t.Fatalf("expected %d, but found %d", tagComparator, tag) + } + s, err := d.readBytes() + require.NoError(t, err) + if c.edit.ComparerName != string(s) { + t.Fatalf("expected %q, but found %q", c.edit.ComparerName, s) + } + + // Decode LastSeqNum. + tag, err = d.readUvarint() + require.NoError(t, err) + if tag != tagLastSequence { + t.Fatalf("expected %d, but found %d", tagLastSequence, tag) + } + val, err := d.readUvarint() + require.NoError(t, err) + if c.edit.LastSeqNum != val { + t.Fatalf("expected %d, but found %d", c.edit.LastSeqNum, val) + } + } + }) + } +} + +func TestVersionEditApply(t *testing.T) { + parseMeta := func(s string) (*FileMetadata, error) { + m, err := ParseFileMetadataDebug(s) + if err != nil { + return nil, err + } + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + if m.SmallestSeqNum > m.LargestSeqNum { + m.SmallestSeqNum, m.LargestSeqNum = m.LargestSeqNum, m.SmallestSeqNum + } + m.InitPhysicalBacking() + return m, nil + } + + // TODO(bananabrick): Improve the parsing logic in this test. + datadriven.RunTest(t, "testdata/version_edit_apply", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "apply": + // TODO(sumeer): move this Version parsing code to utils, to + // avoid repeating it, and make it the inverse of + // Version.DebugString(). 
+ var v *Version + var veList []*VersionEdit + isVersion := true + isDelete := true + var level int + var err error + versionFiles := map[base.FileNum]*FileMetadata{} + for _, data := range strings.Split(d.Input, "\n") { + data = strings.TrimSpace(data) + switch data { + case "edit": + isVersion = false + veList = append(veList, &VersionEdit{}) + case "delete": + isVersion = false + isDelete = true + case "add": + isVersion = false + isDelete = false + case "L0", "L1", "L2", "L3", "L4", "L5", "L6": + level, err = strconv.Atoi(data[1:]) + if err != nil { + return err.Error() + } + default: + var ve *VersionEdit + if len(veList) > 0 { + ve = veList[len(veList)-1] + } + if isVersion || !isDelete { + meta, err := parseMeta(data) + if err != nil { + return err.Error() + } + if isVersion { + if v == nil { + v = new(Version) + for l := 0; l < NumLevels; l++ { + v.Levels[l] = makeLevelMetadata(base.DefaultComparer.Compare, l, nil /* files */) + } + } + versionFiles[meta.FileNum] = meta + v.Levels[level].insert(meta) + meta.LatestRef() + } else { + ve.NewFiles = + append(ve.NewFiles, NewFileEntry{Level: level, Meta: meta}) + } + } else { + fileNum, err := strconv.Atoi(data) + if err != nil { + return err.Error() + } + dfe := DeletedFileEntry{Level: level, FileNum: base.FileNum(fileNum)} + if ve.DeletedFiles == nil { + ve.DeletedFiles = make(map[DeletedFileEntry]*FileMetadata) + } + ve.DeletedFiles[dfe] = versionFiles[dfe.FileNum] + } + } + } + + if v != nil { + if err := v.InitL0Sublevels(base.DefaultComparer.Compare, base.DefaultFormatter, 10<<20); err != nil { + return err.Error() + } + } + + bve := BulkVersionEdit{} + bve.AddedByFileNum = make(map[base.FileNum]*FileMetadata) + for _, ve := range veList { + if err := bve.Accumulate(ve); err != nil { + return err.Error() + } + } + zombies := make(map[base.DiskFileNum]uint64) + newv, err := bve.Apply(v, base.DefaultComparer.Compare, base.DefaultFormatter, 10<<20, 32000, zombies, ProhibitSplitUserKeys) + if err != nil { + 
return err.Error() + } + + zombieFileNums := make([]base.DiskFileNum, 0, len(zombies)) + if len(veList) == 1 { + // Only care about zombies if a single version edit was + // being applied. + for fileNum := range zombies { + zombieFileNums = append(zombieFileNums, fileNum) + } + slices.Sort(zombieFileNums) + } + + return fmt.Sprintf("%szombies %d\n", newv, zombieFileNums) + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} diff --git a/pebble/internal/manifest/version_test.go b/pebble/internal/manifest/version_test.go new file mode 100644 index 0000000..abde613 --- /dev/null +++ b/pebble/internal/manifest/version_test.go @@ -0,0 +1,429 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + "fmt" + "strings" + "sync" + "testing" + + "github.com/cockroachdb/datadriven" + "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/stretchr/testify/require" +) + +func levelMetadata(level int, files ...*FileMetadata) LevelMetadata { + return makeLevelMetadata(base.DefaultComparer.Compare, level, files) +} + +func ikey(s string) InternalKey { + return base.MakeInternalKey([]byte(s), 0, base.InternalKeyKindSet) +} + +func TestIkeyRange(t *testing.T) { + cmp := base.DefaultComparer.Compare + testCases := []struct { + input, want string + }{ + { + "", + "-", + }, + { + "a-e", + "a-e", + }, + { + "a-e a-e", + "a-e", + }, + { + "c-g a-e", + "a-g", + }, + { + "a-e c-g a-e", + "a-g", + }, + { + "b-d f-g", + "b-g", + }, + { + "d-e b-d", + "b-e", + }, + { + "e-e", + "e-e", + }, + { + "f-g e-e d-e c-g b-d a-e", + "a-g", + }, + } + for _, tc := range testCases { + var f []*FileMetadata + if tc.input != "" { + for i, s := range strings.Split(tc.input, " ") { + m := (&FileMetadata{ + FileNum: base.FileNum(i), + }).ExtendPointKeyBounds(cmp, 
ikey(s[0:1]), ikey(s[2:3])) + m.InitPhysicalBacking() + f = append(f, m) + } + } + levelMetadata := makeLevelMetadata(base.DefaultComparer.Compare, 0, f) + + sm, la := KeyRange(base.DefaultComparer.Compare, levelMetadata.Iter()) + got := string(sm.UserKey) + "-" + string(la.UserKey) + if got != tc.want { + t.Errorf("KeyRange(%q) = %q, %q", tc.input, got, tc.want) + } + } +} + +func TestOverlaps(t *testing.T) { + var v *Version + cmp := testkeys.Comparer.Compare + fmtKey := testkeys.Comparer.FormatKey + datadriven.RunTest(t, "testdata/overlaps", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "define": + var err error + v, err = ParseVersionDebug(cmp, fmtKey, 64>>10 /* flush split bytes */, d.Input) + if err != nil { + return err.Error() + } + return v.String() + case "overlaps": + var level int + var start, end string + var exclusiveEnd bool + d.ScanArgs(t, "level", &level) + d.ScanArgs(t, "start", &start) + d.ScanArgs(t, "end", &end) + d.ScanArgs(t, "exclusive-end", &exclusiveEnd) + overlaps := v.Overlaps(level, testkeys.Comparer.Compare, []byte(start), []byte(end), exclusiveEnd) + var buf bytes.Buffer + fmt.Fprintf(&buf, "%d files:\n", overlaps.Len()) + overlaps.Each(func(f *FileMetadata) { + fmt.Fprintf(&buf, "%s\n", f.DebugString(base.DefaultFormatter, false)) + }) + return buf.String() + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestContains(t *testing.T) { + cmp := base.DefaultComparer.Compare + newFileMeta := func(fileNum base.FileNum, size uint64, smallest, largest base.InternalKey) *FileMetadata { + m := (&FileMetadata{ + FileNum: fileNum, + Size: size, + }).ExtendPointKeyBounds(cmp, smallest, largest) + m.InitPhysicalBacking() + return m + } + m00 := newFileMeta( + 700, + 1, + base.ParseInternalKey("b.SET.7008"), + base.ParseInternalKey("e.SET.7009"), + ) + m01 := newFileMeta( + 701, + 1, + base.ParseInternalKey("c.SET.7018"), + base.ParseInternalKey("f.SET.7019"), + ) + m02 := newFileMeta( 
+ 702, + 1, + base.ParseInternalKey("f.SET.7028"), + base.ParseInternalKey("g.SET.7029"), + ) + m03 := newFileMeta( + 703, + 1, + base.ParseInternalKey("x.SET.7038"), + base.ParseInternalKey("y.SET.7039"), + ) + m04 := newFileMeta( + 704, + 1, + base.ParseInternalKey("n.SET.7048"), + base.ParseInternalKey("p.SET.7049"), + ) + m05 := newFileMeta( + 705, + 1, + base.ParseInternalKey("p.SET.7058"), + base.ParseInternalKey("p.SET.7059"), + ) + m06 := newFileMeta( + 706, + 1, + base.ParseInternalKey("p.SET.7068"), + base.ParseInternalKey("u.SET.7069"), + ) + m07 := newFileMeta( + 707, + 1, + base.ParseInternalKey("r.SET.7078"), + base.ParseInternalKey("s.SET.7079"), + ) + + m10 := newFileMeta( + 710, + 1, + base.ParseInternalKey("d.SET.7108"), + base.ParseInternalKey("g.SET.7109"), + ) + m11 := newFileMeta( + 711, + 1, + base.ParseInternalKey("g.SET.7118"), + base.ParseInternalKey("j.SET.7119"), + ) + m12 := newFileMeta( + 712, + 1, + base.ParseInternalKey("n.SET.7128"), + base.ParseInternalKey("p.SET.7129"), + ) + m13 := newFileMeta( + 713, + 1, + base.ParseInternalKey("p.SET.7148"), + base.ParseInternalKey("p.SET.7149"), + ) + m14 := newFileMeta( + 714, + 1, + base.ParseInternalKey("p.SET.7138"), + base.ParseInternalKey("u.SET.7139"), + ) + + v := Version{ + Levels: [NumLevels]LevelMetadata{ + 0: levelMetadata(0, m00, m01, m02, m03, m04, m05, m06, m07), + 1: levelMetadata(1, m10, m11, m12, m13, m14), + }, + } + + testCases := []struct { + level int + file *FileMetadata + want bool + }{ + // Level 0: m00=b-e, m01=c-f, m02=f-g, m03=x-y, m04=n-p, m05=p-p, m06=p-u, m07=r-s. + // Note that: + // - the slice isn't sorted (e.g. m02=f-g, m03=x-y, m04=n-p), + // - m00 and m01 overlap (not just touch), + // - m06 contains m07, + // - m00, m01 and m02 transitively overlap/touch each other, and + // - m04, m05, m06 and m07 transitively overlap/touch each other. 
+ {0, m00, true}, + {0, m01, true}, + {0, m02, true}, + {0, m03, true}, + {0, m04, true}, + {0, m05, true}, + {0, m06, true}, + {0, m07, true}, + {0, m10, false}, + {0, m11, false}, + {0, m12, false}, + {0, m13, false}, + {0, m14, false}, + {1, m00, false}, + {1, m01, false}, + {1, m02, false}, + {1, m03, false}, + {1, m04, false}, + {1, m05, false}, + {1, m06, false}, + {1, m07, false}, + {1, m10, true}, + {1, m11, true}, + {1, m12, true}, + {1, m13, true}, + {1, m14, true}, + + // Level 2: empty. + {2, m00, false}, + {2, m14, false}, + } + + for _, tc := range testCases { + got := v.Contains(tc.level, cmp, tc.file) + if got != tc.want { + t.Errorf("level=%d, file=%s\ngot %t\nwant %t", tc.level, tc.file, got, tc.want) + } + } +} + +func TestVersionUnref(t *testing.T) { + list := &VersionList{} + list.Init(&sync.Mutex{}) + v := &Version{Deleted: func([]*FileBacking) {}} + v.Ref() + list.PushBack(v) + v.Unref() + if !list.Empty() { + t.Fatalf("expected version list to be empty") + } +} + +func TestCheckOrdering(t *testing.T) { + cmp := base.DefaultComparer.Compare + fmtKey := base.DefaultComparer.FormatKey + datadriven.RunTest(t, "testdata/version_check_ordering", + func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "check-ordering": + orderingInvariants := ProhibitSplitUserKeys + if d.HasArg("allow-split-user-keys") { + orderingInvariants = AllowSplitUserKeys + } + v, err := ParseVersionDebug(cmp, fmtKey, 10<<20, d.Input) + if err != nil { + return err.Error() + } + // L0 files compare on sequence numbers. Use the seqnums from the + // smallest / largest bounds for the table. 
+ v.Levels[0].Slice().Each(func(m *FileMetadata) { + m.SmallestSeqNum = m.Smallest.SeqNum() + m.LargestSeqNum = m.Largest.SeqNum() + }) + if err = v.CheckOrdering(cmp, base.DefaultFormatter, orderingInvariants); err != nil { + return err.Error() + } + return "OK" + + default: + return fmt.Sprintf("unknown command: %s", d.Cmd) + } + }) +} + +func TestExtendBounds(t *testing.T) { + cmp := base.DefaultComparer.Compare + parseBounds := func(line string) (lower, upper InternalKey) { + parts := strings.Split(line, "-") + if len(parts) == 1 { + parts = strings.Split(parts[0], ":") + start, end := strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1]) + lower = base.ParseInternalKey(start) + switch k := lower.Kind(); k { + case base.InternalKeyKindRangeDelete: + upper = base.MakeRangeDeleteSentinelKey([]byte(end)) + case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete: + upper = base.MakeExclusiveSentinelKey(k, []byte(end)) + default: + panic(fmt.Sprintf("unknown kind %s with end key", k)) + } + } else { + l, u := strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1]) + lower, upper = base.ParseInternalKey(l), base.ParseInternalKey(u) + } + return + } + format := func(m *FileMetadata) string { + var b bytes.Buffer + var smallest, largest string + switch m.boundTypeSmallest { + case boundTypePointKey: + smallest = "point" + case boundTypeRangeKey: + smallest = "range" + default: + return fmt.Sprintf("unknown bound type %d", m.boundTypeSmallest) + } + switch m.boundTypeLargest { + case boundTypePointKey: + largest = "point" + case boundTypeRangeKey: + largest = "range" + default: + return fmt.Sprintf("unknown bound type %d", m.boundTypeLargest) + } + bounds, err := m.boundsMarker() + if err != nil { + panic(err) + } + fmt.Fprintf(&b, "%s\n", m.DebugString(base.DefaultFormatter, true)) + fmt.Fprintf(&b, " bounds: (smallest=%s,largest=%s) (0x%08b)\n", smallest, largest, bounds) + return b.String() + } + m := 
&FileMetadata{} + datadriven.RunTest(t, "testdata/file_metadata_bounds", func(t *testing.T, d *datadriven.TestData) string { + switch d.Cmd { + case "reset": + m = &FileMetadata{} + return "" + case "extend-point-key-bounds": + u, l := parseBounds(d.Input) + m.ExtendPointKeyBounds(cmp, u, l) + return format(m) + case "extend-range-key-bounds": + u, l := parseBounds(d.Input) + m.ExtendRangeKeyBounds(cmp, u, l) + return format(m) + default: + return fmt.Sprintf("unknown command %s\n", d.Cmd) + } + }) +} + +func TestFileMetadata_ParseRoundTrip(t *testing.T) { + testCases := []struct { + name string + input string + output string + }{ + { + name: "point keys only", + input: "000001:[a#0,SET-z#0,DEL] seqnums:[0-0] points:[a#0,SET-z#0,DEL]", + }, + { + name: "range keys only", + input: "000001:[a#0,RANGEKEYSET-z#0,RANGEKEYDEL] seqnums:[0-0] ranges:[a#0,RANGEKEYSET-z#0,RANGEKEYDEL]", + }, + { + name: "point and range keys", + input: "000001:[a#0,RANGEKEYSET-d#0,DEL] seqnums:[0-0] points:[b#0,SET-d#0,DEL] ranges:[a#0,RANGEKEYSET-c#0,RANGEKEYDEL]", + }, + { + name: "point and range keys with nonzero senums", + input: "000001:[a#3,RANGEKEYSET-d#4,DEL] seqnums:[3-7] points:[b#3,SET-d#4,DEL] ranges:[a#3,RANGEKEYSET-c#5,RANGEKEYDEL]", + }, + { + name: "whitespace", + input: " 000001 : [ a#0,SET - z#0,DEL] points : [ a#0,SET - z#0,DEL] ", + output: "000001:[a#0,SET-z#0,DEL] seqnums:[0-0] points:[a#0,SET-z#0,DEL]", + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + m, err := ParseFileMetadataDebug(tc.input) + require.NoError(t, err) + err = m.Validate(base.DefaultComparer.Compare, base.DefaultFormatter) + require.NoError(t, err) + got := m.DebugString(base.DefaultFormatter, true) + want := tc.input + if tc.output != "" { + want = tc.output + } + require.Equal(t, want, got) + }) + } +} diff --git a/pebble/internal/manual/manual.go b/pebble/internal/manual/manual.go new file mode 100644 index 0000000..640816a --- /dev/null +++ 
b/pebble/internal/manual/manual.go @@ -0,0 +1,60 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manual + +// #include +import "C" +import "unsafe" + +// The go:linkname directives provides backdoor access to private functions in +// the runtime. Below we're accessing the throw function. + +//go:linkname throw runtime.throw +func throw(s string) + +// TODO(peter): Rather than relying an C malloc/free, we could fork the Go +// runtime page allocator and allocate large chunks of memory using mmap or +// similar. + +// New allocates a slice of size n. The returned slice is from manually managed +// memory and MUST be released by calling Free. Failure to do so will result in +// a memory leak. +func New(n int) []byte { + if n == 0 { + return make([]byte, 0) + } + // We need to be conscious of the Cgo pointer passing rules: + // + // https://golang.org/cmd/cgo/#hdr-Passing_pointers + // + // ... + // Note: the current implementation has a bug. While Go code is permitted + // to write nil or a C pointer (but not a Go pointer) to C memory, the + // current implementation may sometimes cause a runtime error if the + // contents of the C memory appear to be a Go pointer. Therefore, avoid + // passing uninitialized C memory to Go code if the Go code is going to + // store pointer values in it. Zero out the memory in C before passing it + // to Go. + ptr := C.calloc(C.size_t(n), 1) + if ptr == nil { + // NB: throw is like panic, except it guarantees the process will be + // terminated. The call below is exactly what the Go runtime invokes when + // it cannot allocate memory. + throw("out of memory") + } + // Interpret the C pointer as a pointer to a Go array, then slice. + return (*[MaxArrayLen]byte)(unsafe.Pointer(ptr))[:n:n] +} + +// Free frees the specified slice. 
+func Free(b []byte) { + if cap(b) != 0 { + if len(b) == 0 { + b = b[:cap(b)] + } + ptr := unsafe.Pointer(&b[0]) + C.free(ptr) + } +} diff --git a/pebble/internal/manual/manual_32bit.go b/pebble/internal/manual/manual_32bit.go new file mode 100644 index 0000000..19369fa --- /dev/null +++ b/pebble/internal/manual/manual_32bit.go @@ -0,0 +1,13 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build 386 || amd64p32 || arm || armbe || ppc || sparc +// +build 386 amd64p32 arm armbe ppc sparc + +package manual + +const ( + // MaxArrayLen is a safe maximum length for slices on this architecture. + MaxArrayLen = 1<<31 - 1 +) diff --git a/pebble/internal/manual/manual_64bit.go b/pebble/internal/manual/manual_64bit.go new file mode 100644 index 0000000..8c08232 --- /dev/null +++ b/pebble/internal/manual/manual_64bit.go @@ -0,0 +1,13 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build amd64 || arm64 || arm64be || ppc64 || ppc64le || mips64 || mips64le || s390x || sparc64 || riscv64 +// +build amd64 arm64 arm64be ppc64 ppc64le mips64 mips64le s390x sparc64 riscv64 + +package manual + +const ( + // MaxArrayLen is a safe maximum length for slices on this architecture. + MaxArrayLen = 1<<50 - 1 +) diff --git a/pebble/internal/manual/manual_mips.go b/pebble/internal/manual/manual_mips.go new file mode 100644 index 0000000..08bb880 --- /dev/null +++ b/pebble/internal/manual/manual_mips.go @@ -0,0 +1,13 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +//go:build mips || mipsle || mips64p32 || mips64p32le +// +build mips mipsle mips64p32 mips64p32le + +package manual + +const ( + // MaxArrayLen is a safe maximum length for slices on this architecture. + MaxArrayLen = 1 << 30 +) diff --git a/pebble/internal/manual/manual_nocgo.go b/pebble/internal/manual/manual_nocgo.go new file mode 100644 index 0000000..74befbd --- /dev/null +++ b/pebble/internal/manual/manual_nocgo.go @@ -0,0 +1,20 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !cgo +// +build !cgo + +package manual + +// Provides versions of New and Free when cgo is not available (e.g. cross +// compilation). + +// New allocates a slice of size n. +func New(n int) []byte { + return make([]byte, n) +} + +// Free frees the specified slice. +func Free(b []byte) { +} diff --git a/pebble/internal/metamorphic/.gitignore b/pebble/internal/metamorphic/.gitignore new file mode 100644 index 0000000..33a7810 --- /dev/null +++ b/pebble/internal/metamorphic/.gitignore @@ -0,0 +1,2 @@ +_meta/ +*.test diff --git a/pebble/internal/metamorphic/crossversion/crossversion_test.go b/pebble/internal/metamorphic/crossversion/crossversion_test.go new file mode 100644 index 0000000..192140e --- /dev/null +++ b/pebble/internal/metamorphic/crossversion/crossversion_test.go @@ -0,0 +1,409 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package crossversion builds on the metamorphic testing implemented in +// internal/metamorphic, performing metamorphic testing across versions of +// Pebble. This improves test coverage of upgrade and migration code paths. 
+package crossversion + +import ( + "bytes" + "context" + "flag" + "fmt" + "io" + "math/rand" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "sync" + "testing" + "time" + "unicode" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/metamorphic" + "github.com/cockroachdb/pebble/vfs" + "github.com/stretchr/testify/require" +) + +var ( + factor int + seed int64 + versions pebbleVersions + artifactsDir string + streamOutput bool +) + +func init() { + // NB: If you add new command-line flags, you should update the + // reproductionCommand function. + flag.Int64Var(&seed, "seed", 0, + `a pseudorandom number generator seed`) + flag.IntVar(&factor, "factor", 10, + `the number of data directories to carry forward +from one version's run to the subsequent version's runs.`) + flag.Var(&versions, "version", + `a comma-separated 3-tuple defining a Pebble version to test. +The expected format is